From 9adb664802ed362671fafa3ab6dadf2af7741211 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 20 Jan 2022 08:57:39 +0200 Subject: [PATCH 01/76] Added features from latest YOLO versions * Supports several algorithms for matching targets to anchors. * Added support for DIoU and CIoU losses. * Target for confidence can be selected between 1.0 and the overlap between the target and the predicted box. * Target classes may be specified as a matrix of class probabilities, allowing multiple classes per object. * Fixed rounding of maxpool layer size. * Weight decay is applied only to convolutional layer weights. * Command line interface is now using LightningCLI. --- pl_bolts/models/detection/__init__.py | 4 +- ...olo_config.py => darknet_configuration.py} | 144 ++-- .../models/detection/yolo/target_matching.py | 319 +++++++++ pl_bolts/models/detection/yolo/utils.py | 120 ++++ pl_bolts/models/detection/yolo/yolo_layers.py | 456 +++--------- pl_bolts/models/detection/yolo/yolo_loss.py | 255 +++++++ pl_bolts/models/detection/yolo/yolo_module.py | 674 ++++++++---------- tests/models/test_detection.py | 143 +++- 8 files changed, 1305 insertions(+), 810 deletions(-) rename pl_bolts/models/detection/yolo/{yolo_config.py => darknet_configuration.py} (61%) create mode 100644 pl_bolts/models/detection/yolo/target_matching.py create mode 100644 pl_bolts/models/detection/yolo/utils.py create mode 100644 pl_bolts/models/detection/yolo/yolo_loss.py diff --git a/pl_bolts/models/detection/__init__.py b/pl_bolts/models/detection/__init__.py index db5525adbc..8defadf410 100644 --- a/pl_bolts/models/detection/__init__.py +++ b/pl_bolts/models/detection/__init__.py @@ -1,13 +1,13 @@ from pl_bolts.models.detection import components from pl_bolts.models.detection.faster_rcnn import FasterRCNN from pl_bolts.models.detection.retinanet import RetinaNet -from pl_bolts.models.detection.yolo.yolo_config import YOLOConfiguration +from pl_bolts.models.detection.yolo.darknet_configuration import DarknetConfiguration from pl_bolts.models.detection.yolo.yolo_module import YOLO __all__ = [ "components", "FasterRCNN", - "YOLOConfiguration", + "DarknetConfiguration", "YOLO", "RetinaNet", ] diff --git a/pl_bolts/models/detection/yolo/yolo_config.py b/pl_bolts/models/detection/yolo/darknet_configuration.py similarity index 61% rename from pl_bolts/models/detection/yolo/yolo_config.py rename to pl_bolts/models/detection/yolo/darknet_configuration.py index fea56df7e8..95064034b8 100644 --- a/pl_bolts/models/detection/yolo/yolo_config.py +++ b/pl_bolts/models/detection/yolo/darknet_configuration.py @@ -1,14 +1,21 @@ import re -from typing import Any, Dict, Iterable, List, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union from warnings import warn import torch.nn as nn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pl_bolts.models.detection.yolo import yolo_layers +from pl_bolts.models.detection.yolo.target_matching import ( + HighestIoUMatching, + IoUThresholdMatching, + SimOTAMatching, + SizeRatioMatching, +) +from pl_bolts.models.detection.yolo.yolo_loss import LossFunction -class YOLOConfiguration: +class DarknetConfiguration: """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. 
The :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method @@ -32,7 +39,7 @@ def __init__(self, path: str) -> None: self.global_config = sections[0] self.layer_configs = sections[1:] - def get_network(self) -> nn.ModuleList: + def get_network(self, **kwargs) -> nn.ModuleList: """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns the network structure that can be used to create a YOLO model. @@ -43,7 +50,7 @@ def get_network(self) -> nn.ModuleList: num_inputs = [3] # Number of channels in the input of every layer up to the current layer for layer_config in self.layer_configs: config = {**self.global_config, **layer_config} - module, num_outputs = _create_layer(config, num_inputs) + module, num_outputs = _create_layer(config, num_inputs, **kwargs) result.append(module) num_inputs.append(num_outputs) return result @@ -145,7 +152,7 @@ def convert(key, value): return sections -def _create_layer(config: dict, num_inputs: List[int]) -> Tuple[nn.Module, int]: +def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tuple[nn.Module, int]: """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the layer config. @@ -165,11 +172,11 @@ def _create_layer(config: dict, num_inputs: List[int]) -> Tuple[nn.Module, int]: "upsample": _create_upsample, "yolo": _create_yolo, } - return create_func[config["type"]](config, num_inputs) + return create_func[config["type"]](config, num_inputs, **kwargs) -def _create_convolutional(config, num_inputs): - module = nn.Sequential() +def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwargs): + layer = nn.Sequential() batch_normalize = config.get("batch_normalize", False) padding = (config["size"] - 1) // 2 if config["pad"] else 0 @@ -177,40 +184,53 @@ def _create_convolutional(config, num_inputs): conv = nn.Conv2d( num_inputs[-1], config["filters"], config["size"], config["stride"], padding, bias=not batch_normalize ) - module.add_module("conv", conv) + layer.add_module("conv", conv) if batch_normalize: - bn = nn.BatchNorm2d(config["filters"]) - module.add_module("bn", bn) + bn = nn.BatchNorm2d(config["filters"]) # YOLOv5: eps=0.001, momentum=0.03 + layer.add_module("bn", bn) activation_name = config["activation"] if activation_name == "leaky": leakyrelu = nn.LeakyReLU(0.1, inplace=True) - module.add_module("leakyrelu", leakyrelu) + layer.add_module("leakyrelu", leakyrelu) elif activation_name == "mish": mish = yolo_layers.Mish() - module.add_module("mish", mish) + layer.add_module("mish", mish) elif activation_name == "swish": swish = nn.SiLU(inplace=True) - module.add_module("swish", swish) + layer.add_module("swish", swish) elif activation_name == "logistic": logistic = nn.Sigmoid() - module.add_module("logistic", logistic) + layer.add_module("logistic", logistic) elif activation_name == "linear": pass else: - raise ValueError("Unknown activation: " + activation_name) + raise MisconfigurationException("Unknown activation: " + activation_name) - return module, config["filters"] + return layer, config["filters"] -def _create_maxpool(config, num_inputs): - padding = (config["size"] - 1) // 2 - module = nn.MaxPool2d(config["size"], config["stride"], padding) - return module, num_inputs[-1] +def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): + """Creates a max pooling layer. 
+ + Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. + """ + kernel_size = config["size"] + padding = (kernel_size - 1) // 2 + maxpool = nn.MaxPool2d(kernel_size, config["stride"], padding) + if kernel_size % 2 == 1: + return maxpool, num_inputs[-1] + + # If the kernel size is an even number, we need one cell of extra padding, on top of the padding + # added by MaxPool2d on both sides. + layer = nn.Sequential() + layer.add_module("pad", nn.ZeroPad2d((0, 1, 0, 1))) + layer.add_module("maxpool", maxpool) + return layer, num_inputs[-1] -def _create_route(config, num_inputs): +def _create_route(config, num_inputs: List[int], **kwargs): num_chunks = config.get("groups", 1) chunk_idx = config.get("group_id", 0) @@ -218,56 +238,76 @@ def _create_route(config, num_inputs): last = len(num_inputs) - 1 source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] - module = yolo_layers.RouteLayer(source_layers, num_chunks, chunk_idx) + layer = yolo_layers.RouteLayer(source_layers, num_chunks, chunk_idx) # The number of outputs of a source layer is the number of inputs of the next layer. num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers) - return module, num_outputs + return layer, num_outputs -def _create_shortcut(config, num_inputs): - module = yolo_layers.ShortcutLayer(config["from"]) - return module, num_inputs[-1] +def _create_shortcut(config: Dict[str, Any], num_inputs: List[int], **kwargs): + layer = yolo_layers.ShortcutLayer(config["from"]) + return layer, num_inputs[-1] -def _create_upsample(config, num_inputs): - module = nn.Upsample(scale_factor=config["stride"], mode="nearest") - return module, num_inputs[-1] +def _create_upsample(config: Dict[str, Any], num_inputs: List[int], **kwargs): + layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") + return layer, num_inputs[-1] -def _create_yolo(config, num_inputs): +def _create_yolo( + config: Dict[str, Any], + num_inputs: List[int], + match_sim_ota: bool = False, + match_size_ratio: Optional[float] = None, + match_iou_threshold: Optional[float] = None, + ignore_iou_threshold: Optional[float] = None, + overlap_loss: Optional[Union[str, Callable]] = None, + predict_overlap: Optional[float] = None, + overlap_loss_multiplier: Optional[float] = None, + class_loss_multiplier: Optional[float] = None, + confidence_loss_multiplier: Optional[float] = None, + **kwargs, +): # The "anchors" list alternates width and height. 
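+    # For example, "anchors = 10,13, 16,30, 33,23" defines the prior shapes (10, 13), (16, 30), and (33, 23).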
anchor_dims = config["anchors"] anchor_dims = [(anchor_dims[i], anchor_dims[i + 1]) for i in range(0, len(anchor_dims), 2)] + anchor_ids = config["mask"] xy_scale = config.get("scale_x_y", 1.0) input_is_normalized = config.get("new_coords", 0) > 0 - ignore_threshold = config.get("ignore_thresh", 1.0) - overlap_loss_multiplier = config.get("iou_normalizer", 1.0) - class_loss_multiplier = config.get("cls_normalizer", 1.0) - confidence_loss_multiplier = config.get("obj_normalizer", 1.0) - - overlap_loss_name = config.get("iou_loss", "mse") - if overlap_loss_name == "mse": - overlap_loss_func = yolo_layers.SELoss() - elif overlap_loss_name == "giou": - overlap_loss_func = yolo_layers.GIoULoss() + ignore_iou_threshold = config.get("ignore_thresh", 1.0) if ignore_iou_threshold is None else ignore_iou_threshold + + overlap_loss = overlap_loss or config.get("iou_loss", "iou") + if overlap_loss_multiplier is None: + overlap_loss_multiplier = config.get("iou_normalizer", 1.0) + if class_loss_multiplier is None: + class_loss_multiplier = config.get("cls_normalizer", 1.0) + if confidence_loss_multiplier is None: + confidence_loss_multiplier = config.get("obj_normalizer", 1.0) + loss_func = LossFunction( + overlap_loss, predict_overlap, overlap_loss_multiplier, class_loss_multiplier, confidence_loss_multiplier + ) + + if sum(var is not None for var in (match_sim_ota, match_size_ratio, match_iou_threshold)) > 1: + raise ValueError("More than one matching algorithm specified.") + if match_sim_ota: + matching_func = SimOTAMatching(loss_func) + elif match_size_ratio is not None: + matching_func = SizeRatioMatching(match_size_ratio, anchor_dims, anchor_ids, ignore_iou_threshold) + elif match_iou_threshold is not None: + matching_func = IoUThresholdMatching(match_iou_threshold, anchor_dims, anchor_ids, ignore_iou_threshold) else: - overlap_loss_func = yolo_layers.IoULoss() + matching_func = HighestIoUMatching(anchor_dims, anchor_ids, ignore_iou_threshold) - module = yolo_layers.DetectionLayer( + layer = yolo_layers.DetectionLayer( num_classes=config["classes"], - anchor_dims=anchor_dims, - anchor_ids=config["mask"], + anchor_dims=[anchor_dims[i] for i in anchor_ids], + matching_func=matching_func, + loss_func=loss_func, xy_scale=xy_scale, input_is_normalized=input_is_normalized, - ignore_threshold=ignore_threshold, - overlap_loss_func=overlap_loss_func, - image_space_loss=overlap_loss_name != "mse", - overlap_loss_multiplier=overlap_loss_multiplier, - class_loss_multiplier=class_loss_multiplier, - confidence_loss_multiplier=confidence_loss_multiplier, ) - return module, num_inputs[-1] + return layer, num_inputs[-1] diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py new file mode 100644 index 0000000000..0181ff4a97 --- /dev/null +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -0,0 +1,319 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Tuple + +import torch +from torch import Tensor + +from pl_bolts.models.detection.yolo.utils import aligned_iou, grid_centers, iou_below, is_inside_box +from pl_bolts.models.detection.yolo.yolo_loss import LossFunction +from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils.warnings import warn_missing_pkg + +if _TORCHVISION_AVAILABLE: + from torchvision.ops import box_convert +else: + warn_missing_pkg("torchvision") + + +class ShapeMatching(ABC): + """Selects which anchors are used to predict each target, by comparing the shape of the target box to a set of + prior 
shapes. + + Most YOLO variants match targets to anchors based on prior shapes that are assigned to the anchors in the model + configuration. The subclasses of ``ShapeMatching`` implement matching rules that compare the width and height of + the targets to each prior shape, regardless of the grid cell where the target is. When the model includes multiple + detection layers, different shapes are defined for each layer. Usually there are three detection layers and three + prior shapes per layer. + + Args: + anchor_dims: A list of all the prior shapes. The list should contain (width, height) tuples in the network input + resolution (relative to the width and height defined in the configuration file). + anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this + layer uses. + ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + """ + + def __init__(self, anchor_dims: List[Tuple[int, int]], anchor_ids: List[int], ignore_iou_threshold: float = 0.7): + self.anchor_dims = anchor_dims + # anchor_map maps the anchor indices to predictors in this layer, or to -1 if it's not an anchor of this layer. + # This layer ignores the target if all the selected anchors are in another layer. + self.anchor_map = [anchor_ids.index(i) if i in anchor_ids else -1 for i in range(len(anchor_dims))] + self.ignore_iou_threshold = ignore_iou_threshold + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + """For each target, selects predictions from the same grid cell, where the center of the target box is. + Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the + predictions within the grid cell. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + preds, targets: Two dictionaries that contain the matched predictions and targets. + """ + height, width = preds["boxes"].shape[:2] + device = preds["boxes"].device + + # A multiplier for scaling image coordinates to feature map coordinates + grid_size = torch.tensor([width, height], device=device) + image_to_grid = torch.true_divide(grid_size, image_size) + + # Bounding box center coordinates are converted to the feature map dimensions so that the whole number tells the + # cell index and the fractional part tells the location inside the cell. + xywh = box_convert(targets["boxes"], in_fmt="xyxy", out_fmt="cxcywh") + grid_xy = xywh[:, :2] * image_to_grid + cell_i = grid_xy[:, 0].to(torch.int64).clamp(0, width - 1) + cell_j = grid_xy[:, 1].to(torch.int64).clamp(0, height - 1) + + matched_targets, matched_predictors = self.match(xywh[:, 2:]) + cell_i = cell_i[matched_targets] + cell_j = cell_j[matched_targets] + + # Background mask is used to select predictors that are not responsible for predicting any object, for + # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a + # predicted box overlaps any target significantly, or if a prediction is matched to a target. 
+ background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_iou_threshold) + background_mask[cell_j, cell_i, matched_predictors] = False + + preds = { + "boxes": preds["boxes"][cell_j, cell_i, matched_predictors], + "confidences": preds["confidences"][cell_j, cell_i, matched_predictors], + "bg_confidences": preds["confidences"][background_mask], + "classprobs": preds["classprobs"][cell_j, cell_i, matched_predictors], + } + targets = { + "boxes": targets["boxes"][matched_targets], + "labels": targets["labels"][matched_targets], + } + return preds, targets + + @abstractmethod + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + + Args: + wh: A matrix of predicted width and height values. + + Returns: + matched_targets, matched_predictors: A vector that can be used to select the targets that this layer + matched and a vector that lists the matching predictors within the grid cell. + """ + pass + + +class HighestIoUMatching(ShapeMatching): + """For each target, select the prior shape that gives the highest IoU. + + This is the original YOLO matching rule. + """ + + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + + ious = aligned_iou(wh, anchor_wh) + highest_iou_anchors = ious.max(1).indices + highest_iou_anchors = anchor_map[highest_iou_anchors] + matched_targets = highest_iou_anchors >= 0 + matched_anchors = highest_iou_anchors[matched_targets] + return matched_targets, matched_anchors + + +class IoUThresholdMatching(ShapeMatching): + """For each target, select all prior shapes that give a high enough IoU. + + Args: + threshold: IoU treshold for matching. + anchor_dims: A list of all the prior shapes. The list should contain (width, height) tuples in the network input + resolution (relative to the width and height defined in the configuration file). + anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this + layer uses. + ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + """ + + def __init__(self, threshold, *args, **kwargs): + super().__init__(*args, **kwargs) + self.threshold = threshold + + def match(self, wh): + anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + + ious = aligned_iou(wh, anchor_wh) + above_threshold = (ious > self.threshold).nonzero() + targets_above_threshold = above_threshold[:, 0] + anchors_above_threshold = above_threshold[:, 1] + anchors_above_threshold = anchor_map[anchors_above_threshold] + local = anchors_above_threshold >= 0 + matched_targets = targets_above_threshold[local] + matched_anchors = anchors_above_threshold[local] + return matched_targets, matched_anchors + + +class SizeRatioMatching(ShapeMatching): + """For each target, select those prior shapes, whose width and height relative to the target is below given + ratio. + + This is the matching rule used by Ultralytics YOLOv5 implementation. + + Args: + threshold: Size ratio threshold for matching. + anchor_dims: A list of all the prior shapes. 
The list should contain (width, height) tuples in the network input + resolution (relative to the width and height defined in the configuration file). + anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this + layer uses. + ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + """ + + def __init__(self, threshold, *args, **kwargs): + super().__init__(*args, **kwargs) + self.threshold = threshold + + def match(self, wh): + anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + + wh_ratio = wh[:, None, :] / anchor_wh[None, :, :] # [num_targets, num_anchors, 2] + wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio) + wh_ratio = wh_ratio.max(2).values # [num_targets, num_anchors] + below_threshold = (wh_ratio < self.threshold).nonzero() + targets_below_threshold = below_threshold[:, 0] + anchors_below_threshold = below_threshold[:, 1] + anchors_below_threshold = anchor_map[anchors_below_threshold] + local = anchors_below_threshold >= 0 + matched_targets = targets_below_threshold[local] + matched_anchors = anchors_below_threshold[local] + return matched_targets, matched_anchors + + +def _sim_ota_match(costs, ious): + """Implements the SimOTA matching rule. + + The number of units supplied by each supplier (training target) needs to be decided in the Optimal Transport + problem. "Dynamic k Estimation" uses the sum of the top 10 IoU values (casted to int) between the target and the + predicted boxes. + + Args: + costs: Sum of losses for (prediction, target) pairs: ``[targets, predictions]`` + ious: IoUs for (prediction, target) pairs: ``[targets, predictions]`` + + Returns: + A mask of predictions that were matched, and the indices of the matched targets. The latter contains as many + elements as there are ``True`` values in the mask. + """ + matching_matrix = torch.zeros_like(costs, dtype=torch.bool) + + if ious.numel() > 0: + # For each target, define k as the sum of the 10 highest IoUs. + top10_iou = torch.topk(ious, min(10, ious.shape[1])).values.sum(1) + ks = torch.clip(top10_iou.int(), min=1) + + # For each target, select k predictions with lowest cost. + for target_idx, (cost, k) in enumerate(zip(costs, ks)): + prediction_idx = torch.topk(cost, k, largest=False).indices + matching_matrix[target_idx, prediction_idx] = True + + # If there's more than one match for some prediction, match it with the best target. Now we consider all + # targets, regardless of whether they were originally matched with the prediction or not. + more_than_one_match = matching_matrix.sum(0) > 1 + best_targets = costs[:, more_than_one_match].argmin(0) + matching_matrix[:, more_than_one_match] = False + matching_matrix[best_targets, more_than_one_match] = True + + # For those predictions that were matched, get the index of the target. + matched_preds = matching_matrix.sum(0) > 0 + matched_targets = matching_matrix[:, matched_preds].int().argmax(0) + return matched_preds, matched_targets + + +class SimOTAMatching: + """Selects which anchors are used to predict each target using the SimOTA matching rule. + + This is the matching rule used by YOLOX. + + Args: + loss_func: A ``LossFunction`` object that can be used to calculate the pairwise costs. 
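+
+    A minimal usage sketch, assuming ``preds`` and ``targets`` are the per-image dictionaries described in
+    ``__call__`` below::
+
+        matching_func = SimOTAMatching(LossFunction(overlap_func="ciou", overlap_multiplier=5.0))
+        matched_preds, matched_targets = matching_func(preds, targets, image_size)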
+ """ + + def __init__(self, loss_func: LossFunction): + self.loss_func = loss_func + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + """For each target, selects predictions using the SimOTA matching rule. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + preds, targets: Two dictionaries that contain the matched predictions and targets. + """ + height, width, boxes_per_cell, num_classes = preds["classprobs"].shape + device = preds["boxes"].device + + # A multiplier for scaling feature map coordinates to image coordinates + grid_size = torch.tensor([width, height], device=device) + grid_to_image = torch.true_divide(image_size, grid_size) + + # Create a matrix for selecting the anchors that are inside the target bounding boxes. + centers = grid_centers(grid_size).view(-1, 2) * grid_to_image + inside_matrix = is_inside_box(centers, targets["boxes"]) + + # Set the width and height of all target bounding boxes to the size of 5 grid cells and create a matrix for + # selecting the anchors that are now inside the boxes. If a small target has no anchors inside its bounding + # box, it will be matched to one of these anchors, but a high penalty will ensure that anchors that are inside + # the bounding box will be preferred. + xywh = box_convert(targets["boxes"], in_fmt="xyxy", out_fmt="cxcywh") + xy = xywh[:, :2] + wh = 5.0 * grid_to_image * torch.ones_like(xy) + xywh = torch.cat((xy, wh), -1) + boxes = box_convert(xywh, in_fmt="cxcywh", out_fmt="xyxy") + close_matrix = is_inside_box(centers, boxes) + + # Flatten the prediction grids and filter them using a [height*width] boolean vector that indicates whether a + # cell center is inside or close enough to one or more targets. 
+ fg_mask = (inside_matrix | close_matrix).sum(0) > 0 + bg_mask = torch.logical_not(fg_mask) + shape = (height * width, boxes_per_cell) + fg_preds = { + "boxes": preds["boxes"].view(*shape, 4)[fg_mask].view(-1, 4), + "confidences": preds["confidences"].view(shape)[fg_mask].view(-1), + "classprobs": preds["classprobs"].view(*shape, num_classes)[fg_mask].view(-1, num_classes), + } + bg_confidences = preds["confidences"].view(shape)[bg_mask].view(-1) + + self.loss_func(fg_preds, targets, input_is_normalized=False) + costs = self.loss_func.overlap_loss + self.loss_func.confidence_loss + self.loss_func.class_loss + costs += 100000.0 * ~inside_matrix[:, fg_mask].repeat_interleave(boxes_per_cell, 1) + matched_preds, matched_targets = _sim_ota_match(costs, self.loss_func.overlap) + + preds = { + "boxes": fg_preds["boxes"][matched_preds], + "confidences": fg_preds["confidences"][matched_preds], + "bg_confidences": torch.cat((bg_confidences, fg_preds["confidences"][torch.logical_not(matched_preds)])), + "classprobs": fg_preds["classprobs"][matched_preds], + } + targets = { + "boxes": targets["boxes"][matched_targets], + "labels": targets["labels"][matched_targets], + } + return preds, targets diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py new file mode 100644 index 0000000000..f23869c974 --- /dev/null +++ b/pl_bolts/models/detection/yolo/utils.py @@ -0,0 +1,120 @@ +from typing import List + +import torch +from torch import Tensor + +from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils.warnings import warn_missing_pkg + +if _TORCHVISION_AVAILABLE: + from torchvision.ops import box_iou +else: + warn_missing_pkg("torchvision") + + +def grid_offsets(grid_size: Tensor) -> Tensor: + """Given a grid size, returns a tensor containing offsets to the grid cells. + + Args: + The width and height of the grid in a tensor. + + Returns: + A ``[height, width, 2]`` tensor containing the grid cell (x, y) offsets. + """ + x_range = torch.arange(grid_size[0], device=grid_size.device) + y_range = torch.arange(grid_size[1], device=grid_size.device) + grid_y, grid_x = torch.meshgrid(y_range, x_range) + return torch.stack((grid_x, grid_y), -1) + + +def grid_centers(grid_size: Tensor) -> Tensor: + """Given a grid size, returns a tensor containing coordinates to the centers of the grid cells. + + Returns: + A ``[height, width, 2]`` tensor containing coordinates to the centers of the grid cells. + """ + return grid_offsets(grid_size) + 0.5 + + +def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: + """Adds offsets to the predicted box center coordinates to obtain global coordinates to the image. + + The predicted coordinates are interpreted as coordinates inside a grid cell whose width and + height is 1. Adding offset to the cell, dividing by the grid size, and multiplying by the + image size, we get global coordinates in the image scale. + + Args: + xy: The predicted center coordinates before scaling. Values from zero to one in a + tensor sized ``[batch_size, height, width, boxes_per_cell, 2]``. + image_size: Width and height in a vector that will be used to scale the coordinates. + + Returns: + Global coordinates scaled to the size of the network input image, in a tensor with the + same shape as the input tensor. 
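+
+    A minimal sketch of the expected behaviour, assuming a 2x2 grid and a 128x128 input image:
+
+        >>> import torch
+        >>> xy = torch.full((1, 2, 2, 1, 2), 0.5)  # every box center predicted at the middle of its cell
+        >>> global_xy(xy, torch.tensor([128, 128]))[0, 1, 0, 0]
+        tensor([32., 96.])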
+ """ + height = xy.shape[1] + width = xy.shape[2] + grid_size = torch.tensor([width, height], device=xy.device) + offset = grid_offsets(grid_size).unsqueeze(2) # [height, width, 1, 2] + scale = torch.true_divide(image_size, grid_size) + return (xy + offset) * scale + + +def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: + """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at + the same coordinates. + + Args: + dims1: Width and height of `N` boxes. Tensor of size ``[N, 2]``. + dims2: Width and height of `M` boxes. Tensor of size ``[M, 2]``. + + Returns: + Tensor of size ``[N, M]`` containing the pairwise IoU values for every element in + ``dims1`` and ``dims2`` + """ + area1 = dims1[:, 0] * dims1[:, 1] # [N] + area2 = dims2[:, 0] * dims2[:, 1] # [M] + + inter_wh = torch.min(dims1[:, None, :], dims2) # [N, M, 2] + inter = inter_wh[:, :, 0] * inter_wh[:, :, 1] # [N, M] + union = area1[:, None] + area2 - inter # [N, M] + + return inter / union + + +def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> List[Tensor]: + """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target + significantly (IoU greater than ``threshold``). + + Args: + pred_boxes: The predicted corner coordinates. Tensor of size ``[height, width, boxes_per_cell, 4]``. + target_boxes: Corner coordinates of the target boxes. Tensor of size ``[height, width, boxes_per_cell, 4]``. + + Returns: + A boolean tensor sized ``[height, width, boxes_per_cell]``, with ``False`` where the predicted box overlaps a + target significantly and ``True`` elsewhere. + """ + shape = pred_boxes.shape[:-1] + pred_boxes = pred_boxes.view(-1, 4) + ious = box_iou(pred_boxes, target_boxes) + best_iou = ious.max(-1).values + below_threshold = best_iou <= threshold + return below_threshold.view(shape) + + +def is_inside_box(points, boxes): + """Get pairwise truth values of whether the point is inside the box. 
+ + Args: + points: point (x, y) coordinates, [points, 2] + boxes: box (x1, y1, x2, y2) coordinates, [boxes, 4] + + Returns: + A tensor shaped ``[boxes, points]`` containing pairwise truth values of whether the points are inside the boxes + """ + points = points.unsqueeze(0) # [1, points, 2] + boxes = boxes.unsqueeze(1) # [boxes, 1, 4] + lt = points - boxes[..., :2] # [boxes, points, 2] + rb = boxes[..., 2:] - points # [boxes, points, 2] + deltas = torch.cat((lt, rb), -1) # [boxes, points, 4] + return deltas.min(-1).values > 0.0 # [boxes, points] diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index 9b1ee891df..0c8cf8771c 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -3,84 +3,11 @@ import torch from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor, nn +from torchvision.ops import box_convert +from pl_bolts.models.detection.yolo.utils import global_xy +from pl_bolts.models.detection.yolo.yolo_loss import LossFunction from pl_bolts.utils import _TORCHVISION_AVAILABLE -from pl_bolts.utils.warnings import warn_missing_pkg - -if _TORCHVISION_AVAILABLE: - from torchvision.ops import box_iou - - try: - from torchvision.ops import generalized_box_iou - except ImportError: - _GIOU_AVAILABLE = False - else: - _GIOU_AVAILABLE = True -else: - warn_missing_pkg("torchvision") - - -def _corner_coordinates(xy: Tensor, wh: Tensor) -> Tensor: - """Converts box center points and sizes to corner coordinates. - - Args: - xy: Center coordinates. Tensor of size ``[..., 2]``. - wh: Width and height. Tensor of size ``[..., 2]``. - - Returns: - A matrix of `(x1, y1, x2, y2)` coordinates. - """ - half_wh = wh / 2 - top_left = xy - half_wh - bottom_right = xy + half_wh - return torch.cat((top_left, bottom_right), -1) - - -def _aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: - """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at - the same coordinates. - - Args: - dims1: Width and height of `N` boxes. Tensor of size ``[N, 2]``. - dims2: Width and height of `M` boxes. Tensor of size ``[M, 2]``. - - Returns: - Tensor of size ``[N, M]`` containing the pairwise IoU values for every element in - ``dims1`` and ``dims2`` - """ - area1 = dims1[:, 0] * dims1[:, 1] # [N] - area2 = dims2[:, 0] * dims2[:, 1] # [M] - - inter_wh = torch.min(dims1[:, None, :], dims2) # [N, M, 2] - inter = inter_wh[:, :, 0] * inter_wh[:, :, 1] # [N, M] - union = area1[:, None] + area2 - inter # [N, M] - - return inter / union - - -class SELoss(nn.MSELoss): - def __init__(self): - super().__init__(reduction="none") - - def forward(self, inputs: Tensor, target: Tensor) -> Tensor: - return super().forward(inputs, target).sum(1) - - -class IoULoss(nn.Module): - def forward(self, inputs: Tensor, target: Tensor) -> Tensor: - return 1.0 - box_iou(inputs, target).diagonal() - - -class GIoULoss(nn.Module): - def __init__(self) -> None: - super().__init__() - if not _GIOU_AVAILABLE: - raise ModuleNotFoundError( # pragma: no-cover - "A more recent version of `torchvision` is needed for generalized IoU loss." 
- ) - - def forward(self, inputs: Tensor, target: Tensor) -> Tensor: - return 1.0 - generalized_box_iou(inputs, target).diagonal() class DetectionLayer(nn.Module): @@ -94,49 +21,25 @@ def __init__( self, num_classes: int, anchor_dims: List[Tuple[int, int]], - anchor_ids: List[int], + matching_func: Callable, + loss_func: LossFunction, xy_scale: float = 1.0, input_is_normalized: bool = False, - ignore_threshold: float = 0.5, - overlap_loss_func: Optional[Callable] = None, - class_loss_func: Optional[Callable] = None, - confidence_loss_func: Optional[Callable] = None, - image_space_loss: bool = False, - overlap_loss_multiplier: float = 1.0, - class_loss_multiplier: float = 1.0, - confidence_loss_multiplier: float = 1.0, ) -> None: """ Args: num_classes: Number of different classes that this layer predicts. - anchor_dims: A list of all the predefined anchor box dimensions. The list should + anchor_dims: A list of the anchor box dimensions for this layer. The list should contain (width, height) tuples in the network input resolution (relative to the width and height defined in the configuration file). - anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) - anchors that this layer uses. + matching_func: The matching algorithm to be used for assigning targets to anchors. + loss_func: ``LossFunction`` object for calculating the losses. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and height are scaled up so that the maximum value is four times the anchor dimension - ignore_threshold: If a predictor is not responsible for predicting any target, but the - corresponding anchor has IoU with some target greater than this threshold, the - predictor will not be taken into account when calculating the confidence loss. - overlap_loss_func: Loss function for bounding box coordinates. Default is the sum of - squared errors. - class_loss_func: Loss function for class probability distribution. Default is the sum - of squared errors. - confidence_loss_func: Loss function for confidence score. Default is the sum of squared - errors. - image_space_loss: If set to ``True``, the overlap loss function will receive the - bounding box `(x1, y1, x2, y2)` coordinates, scaled to the input image size. This is - needed for the IoU losses introduced in YOLOv4. Otherwise the loss will be computed - from the x, y, width, and height values, as predicted by the network (i.e. relative - to the anchor box, and width and height are logarithmic). - overlap_loss_multiplier: Multiply the overlap loss by this factor. - class_loss_multiplier: Multiply the classification loss by this factor. - confidence_loss_multiplier: Multiply the confidence loss by this factor. 
""" super().__init__() @@ -144,47 +47,33 @@ def __init__( raise ModuleNotFoundError("YOLO model uses `torchvision`, which is not installed yet.") self.num_classes = num_classes - self.all_anchor_dims = anchor_dims - self.anchor_dims = [anchor_dims[i] for i in anchor_ids] - self.anchor_map = [anchor_ids.index(i) if i in anchor_ids else -1 for i in range(len(anchor_dims))] + self.anchor_dims = anchor_dims + self.matching_func = matching_func + self.loss_func = loss_func self.xy_scale = xy_scale self.input_is_normalized = input_is_normalized - self.ignore_threshold = ignore_threshold - - self.overlap_loss_func = overlap_loss_func or SELoss() - self.class_loss_func = class_loss_func or SELoss() - self.confidence_loss_func = confidence_loss_func or nn.MSELoss(reduction="none") - self.image_space_loss = image_space_loss - self.overlap_loss_multiplier = overlap_loss_multiplier - self.class_loss_multiplier = class_loss_multiplier - self.confidence_loss_multiplier = confidence_loss_multiplier - - def forward( - self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None - ) -> Tuple[Tensor, Dict[str, Tensor]]: + + def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tensor: """Runs a forward pass through this YOLO detection layer. - Maps cell-local coordinates to global coordinates in the image space, scales the bounding - boxes with the anchors, converts the center coordinates to corner coordinates, and maps - probabilities to the `]0, 1[` range using sigmoid. + Maps cell-local coordinates to global coordinates in the image space, scales the bounding boxes with the + anchors, converts the center coordinates to corner coordinates, and maps probabilities to the `]0, 1[` range + using sigmoid. - If targets are given, computes also losses from the predictions and the targets. This layer - is responsible only for the targets that best match one of the anchors assigned to this - layer. + If targets are given, computes also losses from the predictions and the targets. This layer is responsible only + for the targets that best match one of the anchors assigned to this layer. Training losses will be saved to the + ``losses`` attribute. ``hits`` attribute will be set to the number of targets that this layer was responsible + for. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. Args: x: The output from the previous layer. Tensor of size ``[batch_size, boxes_per_cell * (num_classes + 5), height, width]``. - image_size: Image width and height in a vector (defines the scale of the predicted and - target coordinates). - targets: If set, computes losses from detection layers against these targets. A list of - dictionaries, one for each image. + image_size: Image width and height in a vector (defines the scale of the predicted and target coordinates). + targets: If set, computes losses from detection layers against these targets. A list of target dictionaries, + one for each image. Returns: - output (Tensor), losses (Dict[str, Tensor]), hits (int): Layer output tensor, sized - ``[batch_size, num_anchors * height * width, num_classes + 5]``. If training targets - were provided, also returns a dictionary of losses and the number of targets that this - layer was responsible for. + Layer output tensor, sized ``[batch_size, num_anchors * height * width, num_classes + 5]``. 
""" batch_size, num_features, height, width = x.shape num_attrs = self.num_classes + 5 @@ -200,265 +89,90 @@ def forward( x = x.view(batch_size, height, width, boxes_per_cell, num_attrs) # Take the sigmoid of the bounding box coordinates, confidence score, and class - # probabilities, unless the input is normalized by the previous layer activation. - if self.input_is_normalized: - xy = x[..., :2] - confidence = x[..., 4] - classprob = x[..., 5:] - else: - xy = torch.sigmoid(x[..., :2]) - confidence = torch.sigmoid(x[..., 4]) - classprob = torch.sigmoid(x[..., 5:]) + # probabilities, unless the input is normalized by the previous layer activation. Confidence + # and class losses use the unnormalized values if possible. + norm_x = x if self.input_is_normalized else torch.sigmoid(x) + xy = norm_x[..., :2] wh = x[..., 2:4] + confidence = x[..., 4] + classprob = x[..., 5:] + norm_confidence = norm_x[..., 4] + norm_classprob = norm_x[..., 5:] # Eliminate grid sensitivity. The previous layer should output extremely high values for # the sigmoid to produce x/y coordinates close to one. YOLOv4 solves this by scaling the # x/y coordinates. xy = xy * self.xy_scale - 0.5 * (self.xy_scale - 1) - image_xy = self._global_xy(xy, image_size) + image_xy = global_xy(xy, image_size) if self.input_is_normalized: image_wh = 4 * torch.square(wh) * torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) else: image_wh = torch.exp(wh) * torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) - boxes = _corner_coordinates(image_xy, image_wh) - output = torch.cat((boxes, confidence.unsqueeze(-1), classprob), -1) + box = torch.cat((image_xy, image_wh), -1) + box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy") + output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) output = output.reshape(batch_size, height * width * boxes_per_cell, num_attrs) - if targets is None: - return output - - lc_mask = self._low_confidence_mask(boxes, targets) - if not self.image_space_loss: - boxes = torch.cat((xy, wh), -1) - losses, hits = self._calculate_losses(boxes, confidence, classprob, targets, image_size, lc_mask) - return output, losses, hits - - def _global_xy(self, xy: Tensor, image_size: Tensor) -> Tensor: - """Adds offsets to the predicted box center coordinates to obtain global coordinates to the image. - - The predicted coordinates are interpreted as coordinates inside a grid cell whose width and - height is 1. Adding offset to the cell, dividing by the grid size, and multiplying by the - image size, we get global coordinates in the image scale. + if targets is not None: + # We want to use binary_cross_entropy_with_logits, so we'll use the unnormalized confidence and classprob, + # if possible. + preds = [{"boxes": b, "confidences": c, "classprobs": p} for b, c, p in zip(box, confidence, classprob)] + self._calculate_losses(preds, targets, image_size) - Args: - xy: The predicted center coordinates before scaling. Values from zero to one in a - tensor sized ``[batch_size, height, width, boxes_per_cell, 2]``. - image_size: Width and height in a vector that will be used to scale the coordinates. - - Returns: - Global coordinates scaled to the size of the network input image, in a tensor with the - same shape as the input tensor. 
- """ - height = xy.shape[1] - width = xy.shape[2] - grid_size = torch.tensor([width, height], device=xy.device) - - x_range = torch.arange(width, device=xy.device) - y_range = torch.arange(height, device=xy.device) - grid_y, grid_x = torch.meshgrid(y_range, x_range) - offset = torch.stack((grid_x, grid_y), -1) # [height, width, 2] - offset = offset.unsqueeze(2) # [height, width, 1, 2] - - scale = torch.true_divide(image_size, grid_size) - return (xy + offset) * scale - - def _low_confidence_mask(self, boxes: Tensor, targets: List[Dict[str, Tensor]]) -> Tensor: - """Initializes the mask that will be used to select predictors that are not predicting any ground-truth - target. The value will be ``True``, unless the predicted box overlaps any target significantly (IoU greater - than ``self.ignore_threshold``). - - Args: - boxes: The predicted corner coordinates in the image space. Tensor of size - ``[batch_size, height, width, boxes_per_cell, 4]``. - targets: List of dictionaries of ground-truth targets, one dictionary per image. - - Returns: - A boolean tensor shaped ``[batch_size, height, width, boxes_per_cell]`` with ``False`` - where the predicted box overlaps a target significantly and ``True`` elsewhere. - """ - batch_size, height, width, boxes_per_cell, num_coords = boxes.shape - num_preds = height * width * boxes_per_cell - boxes = boxes.view(batch_size, num_preds, num_coords) - - results = torch.ones((batch_size, num_preds), dtype=torch.bool, device=boxes.device) - for image_idx, (image_boxes, image_targets) in enumerate(zip(boxes, targets)): - target_boxes = image_targets["boxes"] - if target_boxes.shape[0] > 0: - ious = box_iou(image_boxes, target_boxes) # [num_preds, num_targets] - best_iou = ious.max(-1).values # [num_preds] - results[image_idx] = best_iou <= self.ignore_threshold - - return results.view((batch_size, height, width, boxes_per_cell)) + return output def _calculate_losses( self, - boxes: Tensor, - confidence: Tensor, - classprob: Tensor, + preds: List[Dict[str, Tensor]], targets: List[Dict[str, Tensor]], image_size: Tensor, - lc_mask: Tensor, - ) -> Dict[str, Tensor]: - """From the targets that are in the image space calculates the actual targets for the network predictions, - and returns a dictionary of training losses. + ): + """Matches the predictions to targets and calculates the losses. Creates the attributes ``losses`` and + ``hits``. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. + ``hits`` is the number of targets that this layer was responsible for. Args: - boxes: The predicted bounding boxes. A tensor sized - ``[batch_size, height, width, boxes_per_cell, 4]``. - confidence: The confidence predictions, normalized to `[0, 1]`. A tensor sized - ``[batch_size, height, width, boxes_per_cell]``. - classprob: The class probability predictions, normalized to `[0, 1]`. A tensor sized - ``[batch_size, height, width, boxes_per_cell, num_classes]``. - targets: List of dictionaries of target values, one dictionary for each image. - image_size: Width and height in a vector that defines the scale of the target - coordinates. - lc_mask: A boolean mask containing ``True`` where the predicted box does not overlap - any target significantly. - - Returns: - losses (Dict[str, Tensor]), hits (int): A dictionary of training losses and the number - of targets that this layer was responsible for. + preds: List of predictions for each image. + targets: List of training targets for each image. 
+ image_size: Width and height in a vector that defines the scale of the target coordinates. """ - batch_size, height, width, boxes_per_cell, _ = boxes.shape - device = boxes.device - assert batch_size == len(targets) - - # A multiplier for scaling image coordinates to feature map coordinates - grid_size = torch.tensor([width, height], device=device) - image_to_grid = torch.true_divide(grid_size, image_size) - - anchor_wh = torch.tensor(self.all_anchor_dims, dtype=boxes.dtype, device=device) - anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=device) - - # List of predicted and target values for the predictors that are responsible for - # predicting a target. - target_xy = [] - target_wh = [] - target_label = [] - size_compensation = [] - pred_boxes = [] - pred_classprob = [] - pred_confidence = [] - hits = 0 - - for image_idx, image_targets in enumerate(targets): - target_boxes = image_targets["boxes"] - if target_boxes.shape[0] < 1: - continue - - # Bounding box corner coordinates are converted to center coordinates, width, and - # height. - wh = target_boxes[:, 2:4] - target_boxes[:, 0:2] - xy = target_boxes[:, 0:2] + (wh / 2) - - # The center coordinates are converted to the feature map dimensions so that the whole - # number tells the cell index and the fractional part tells the location inside the cell. - grid_xy = xy * image_to_grid - cell_i = grid_xy[:, 0].to(torch.int64).clamp(0, width - 1) - cell_j = grid_xy[:, 1].to(torch.int64).clamp(0, height - 1) - - # We want to know which anchor box overlaps a ground truth box more than any other - # anchor box. We know that the anchor box is located in the same grid cell as the - # ground truth box. For each prior shape (width, height), we calculate the IoU with - # all ground truth boxes, assuming the boxes are at the same location. Then for each - # target, we select the prior shape that gives the highest IoU. - ious = _aligned_iou(wh, anchor_wh) - best_anchors = ious.max(1).indices - - # ``anchor_map`` maps the anchor indices to the predictors in this layer, or to -1 if - # it's not an anchor of this layer. We ignore the predictions if the best anchor is in - # another layer. - predictors = anchor_map[best_anchors] - selected = predictors >= 0 - cell_i = cell_i[selected] - cell_j = cell_j[selected] - predictors = predictors[selected] - wh = wh[selected] - # sum() is equivalent to count_nonzero() and available before PyTorch 1.7. - hits += selected.sum() - - # The "low-confidence" mask is used to select predictors that are not responsible for - # predicting any object, for calculating the part of the confidence loss with zero as - # the target confidence. - lc_mask[image_idx, cell_j, cell_i, predictors] = False - - # IoU losses are calculated from the image space coordinates. The squared-error loss is - # calculated from the raw predicted values. - if self.image_space_loss: - xy = xy[selected] - target_xy.append(xy) - target_wh.append(wh) - else: - grid_xy = grid_xy[selected] - best_anchors = best_anchors[selected] - relative_xy = grid_xy - grid_xy.floor() - if self.input_is_normalized: - relative_wh = torch.sqrt(wh / (4 * anchor_wh[best_anchors] + 1e-16)) - else: - relative_wh = torch.log(wh / anchor_wh[best_anchors] + 1e-16) - target_xy.append(relative_xy) - target_wh.append(relative_wh) - - # Size compensation factor for bounding box overlap loss is calculated from unit width - # and height. 
- unit_wh = wh / image_size - size_compensation.append(2 - (unit_wh[:, 0] * unit_wh[:, 1])) - - # The data may contain a different number of classes than this detection layer. In case - # a label is greater than the number of classes that this layer predicts, it will be - # mapped to the last class. - labels = image_targets["labels"] - labels = labels[selected] - labels = torch.min(labels, torch.tensor(self.num_classes - 1, device=device)) - target_label.append(labels) - - pred_boxes.append(boxes[image_idx, cell_j, cell_i, predictors]) - pred_classprob.append(classprob[image_idx, cell_j, cell_i, predictors]) - pred_confidence.append(confidence[image_idx, cell_j, cell_i, predictors]) - - losses = dict() - - if pred_boxes and target_xy and target_wh: - size_compensation = torch.cat(size_compensation) - pred_boxes = torch.cat(pred_boxes) - if self.image_space_loss: - target_boxes = _corner_coordinates(torch.cat(target_xy), torch.cat(target_wh)) + batch_size = len(preds) + if batch_size != len(targets): + raise ValueError("Different batch size for predictions and targets.") + + matches = [] + for image_preds, image_targets in zip(preds, targets): + if image_targets["boxes"].shape[0] > 0: + matched_preds, matched_targets = self.matching_func(image_preds, image_targets, image_size) else: - target_boxes = torch.cat((torch.cat(target_xy), torch.cat(target_wh)), -1) - overlap_loss = self.overlap_loss_func(pred_boxes, target_boxes) - overlap_loss = overlap_loss * size_compensation - overlap_loss = overlap_loss.sum() / batch_size - losses["overlap"] = overlap_loss * self.overlap_loss_multiplier - else: - losses["overlap"] = torch.tensor(0.0, device=device) - - if pred_classprob and target_label: - pred_classprob = torch.cat(pred_classprob) - target_label = torch.cat(target_label) - target_classprob = torch.nn.functional.one_hot(target_label, self.num_classes) - target_classprob = target_classprob.to(dtype=pred_classprob.dtype) - class_loss = self.class_loss_func(pred_classprob, target_classprob) - class_loss = class_loss.sum() / batch_size - losses["class"] = class_loss * self.class_loss_multiplier - else: - losses["class"] = torch.tensor(0.0, device=device) - - pred_low_confidence = confidence[lc_mask] - target_low_confidence = torch.zeros_like(pred_low_confidence) - if pred_confidence: - pred_high_confidence = torch.cat(pred_confidence) - target_high_confidence = torch.ones_like(pred_high_confidence) - pred_confidence = torch.cat((pred_low_confidence, pred_high_confidence)) - target_confidence = torch.cat((target_low_confidence, target_high_confidence)) - else: - pred_confidence = pred_low_confidence - target_confidence = target_low_confidence - confidence_loss = self.confidence_loss_func(pred_confidence, target_confidence) - confidence_loss = confidence_loss.sum() / batch_size - losses["confidence"] = confidence_loss * self.confidence_loss_multiplier - - return losses, hits + device = image_preds["confidences"].device + matched_preds = { + "boxes": torch.empty((0, 4), device=device), + "confidences": torch.empty(0, device=device), + "bg_confidences": image_preds["confidences"].flatten(), + "classprobs": torch.empty((0, self.num_classes), device=device), + } + matched_targets = { + "boxes": torch.empty((0, 4), device=device), + "labels": torch.empty(0, dtype=torch.int64, device=device), + } + matches.append((matched_preds, matched_targets)) + + matched_preds = { + "boxes": torch.cat(tuple(m[0]["boxes"] for m in matches)), + "confidences": torch.cat(tuple(m[0]["confidences"] for m in matches)), + 
"bg_confidences": torch.cat(tuple(m[0]["bg_confidences"] for m in matches)), + "classprobs": torch.cat(tuple(m[0]["classprobs"] for m in matches)), + } + matched_targets = { + "boxes": torch.cat(tuple(m[1]["boxes"] for m in matches)), + "labels": torch.cat(tuple(m[1]["labels"] for m in matches)), + } + self.loss_func(matched_preds, matched_targets, self.input_is_normalized, image_size) + overlap_loss, confidence_loss, class_loss = self.loss_func.sums() + self.losses = torch.stack((overlap_loss, confidence_loss, class_loss)) / batch_size + self.hits = len(matched_targets["boxes"]) class Mish(nn.Module): diff --git a/pl_bolts/models/detection/yolo/yolo_loss.py b/pl_bolts/models/detection/yolo/yolo_loss.py new file mode 100644 index 0000000000..5f3f38476a --- /dev/null +++ b/pl_bolts/models/detection/yolo/yolo_loss.py @@ -0,0 +1,255 @@ +import math +from typing import Callable, Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits +from torchvision.ops import box_iou, generalized_box_iou + + +def _upcast(t: Tensor) -> Tensor: + """Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.""" + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def complete_iou(boxes1: Tensor, boxes2: Tensor, distance_only: bool = False) -> Tensor: + """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to + be in `(x1, y1, x2, y2)` format. + + Args: + boxes1: Box coordinates in a tensor of size ``[N, 4]``. + boxes2: Box coordinates in a tensor of size ``[M, 4]``. + distance_only: If set to ``True``, returns the Distance IoU. + + Returns: + A matrix containing the `NxM` complete IoU values between boxes from ``boxes1`` and ``boxes2``. + """ + + # Degenerate boxes give inf / nan results, so do an early check. + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + + iou = box_iou(boxes1, boxes2) + + boxes1 = boxes1.unsqueeze(1) # [N, 1, 4] + boxes2 = boxes2.unsqueeze(0) # [1, M, 4] + + lti = torch.min(boxes1[..., :2], boxes2[..., :2]) + rbi = torch.max(boxes1[..., 2:], boxes2[..., 2:]) + + whi = _upcast(rbi - lti).clamp(min=0) # [N, M, 2] + wi = whi[..., 0] + hi = whi[..., 1] + sqr_length = wi * wi + hi * hi # [N, M] + + wh1 = boxes1[..., 2:] - boxes1[..., :2] + wh2 = boxes2[..., 2:] - boxes2[..., :2] + center1 = boxes1[..., :2] + (wh1 / 2) + center2 = boxes2[..., :2] + (wh2 / 2) + offset = center2 - center1 # [N, M, 2] + dx = offset[..., 0] + dy = offset[..., 1] + sqr_distance = dx * dx + dy * dy # [N, M] + + diou = torch.where(sqr_length > 0.0, iou - (sqr_distance / sqr_length), iou) + if distance_only: + return diou + + w1 = wh1[..., 0] + h1 = wh1[..., 1] + w2 = wh2[..., 0] + h2 = wh2[..., 1] + daspect = torch.atan(w2 / h2) - torch.atan(w1 / h1) # [N, M] + aspect_loss = 4 / (math.pi * math.pi) * (daspect * daspect) + + with torch.no_grad(): + alpha = aspect_loss / (1 - iou + aspect_loss + 1e-6) + + return diou - (alpha * aspect_loss) + + +def distance_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return complete_iou(boxes1, boxes2, distance_only=True) + + +class LossFunction: + """A class for calculating the YOLO losses from predictions and targets. + + Args: + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. 
The function has to
+            return a matrix of pairwise overlap values, with one value for each combination of a target and a
+            predicted box.
+        predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target
+            confidence is one if there's an object, and 1.0 means that the target confidence is the output of
+            ``overlap_func``.
+        overlap_multiplier: Multiply the overlap loss by this factor.
+        confidence_multiplier: Multiply the confidence loss by this factor.
+        class_multiplier: Multiply the classification loss by this factor.
+    """
+
+    def __init__(
+        self,
+        overlap_func: Union[str, Callable] = "iou",
+        predict_overlap: Optional[float] = None,
+        overlap_multiplier: float = 1.0,
+        confidence_multiplier: float = 1.0,
+        class_multiplier: float = 1.0,
+    ):
+        if overlap_func == "iou":
+            self.overlap_func = box_iou
+        elif overlap_func == "giou":
+            self.overlap_func = generalized_box_iou
+        elif overlap_func == "diou":
+            self.overlap_func = distance_iou
+        elif overlap_func == "ciou":
+            self.overlap_func = complete_iou
+        elif callable(overlap_func):
+            self.overlap_func = overlap_func
+        else:
+            raise ValueError("Unknown overlap function: " + overlap_func)
+
+        self.predict_overlap = predict_overlap
+
+        self.overlap_multiplier = overlap_multiplier
+        self.confidence_multiplier = confidence_multiplier
+        self.class_multiplier = class_multiplier
+
+    def _calculate_overlap(
+        self, preds: Tensor, targets: Tensor, image_size: Optional[Tensor] = None
+    ) -> Tuple[Tensor, Tensor]:
+        """Calculates the overlap and overlap loss.
+
+        The overlap is calculated using ``self.overlap_func``. Overlap loss is ``1 - overlap``. If ``image_size`` is
+        given, the loss is scaled by a factor that is large for small boxes (the maximum value is 2) and small for large
+        boxes (the minimum value is 1).
+
+        Args:
+            preds: An ``[N, 4]`` matrix of predicted `(x1, y1, x2, y2)` coordinates.
+            targets: An ``[M, 4]`` matrix of target `(x1, y1, x2, y2)` coordinates.
+            image_size: If given, the overlap loss is scaled by a size compensation factor that is calculated from the
+                target width and height relative to this image size.
+
+        Returns:
+            overlap, overlap_loss: Two ``[M, N]`` matrices: the overlap and the overlap loss between all combinations of
+                a target and a prediction.
+        """
+        overlap = self.overlap_func(targets, preds)
+        overlap_loss = 1.0 - overlap
+        if image_size is not None:
+            # The compensation factor is calculated per target, so it has to be broadcast over the predictions.
+            unit_wh = (targets[:, 2:] - targets[:, :2]) / image_size
+            size_compensation = 2 - (unit_wh[:, 0] * unit_wh[:, 1])
+            overlap_loss = overlap_loss * size_compensation[:, None]
+        return overlap, overlap_loss
+
+    def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callable):
+        """Calculates the confidence loss for foreground anchors.
+
+        If ``self.predict_overlap`` is set, the target confidence is interpolated between 1 and ``overlap``, using
+        ``self.predict_overlap`` as the weight of the overlap. Otherwise the target confidence is 1. The method returns
+        a matrix of losses for target/prediction pairs.
+
+        Args:
+            preds: An ``[N]`` vector of predicted confidences.
+            overlap: An ``[M, N]`` matrix of the overlap between all combinations of a target bounding box and a
+                predicted bounding box.
+            bce_func: A function for calculating binary cross entropy.
+
+        Returns:
+            An ``[M, N]`` matrix of confidence loss between all combinations of a target and a prediction.
+        """
+        if self.predict_overlap is not None:
+            # When predicting overlap, target confidence is different for each pair of a prediction and a target. The
+            # tensors have to be broadcasted to [M, N].
+            preds = preds.unsqueeze(0)
+            preds = torch.broadcast_to(preds, overlap.shape)
+            targets = torch.ones_like(preds) - self.predict_overlap
+            # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative.
+            targets = targets + (self.predict_overlap * overlap.detach().clamp(min=0))
+        else:
+            targets = torch.ones_like(preds)
+
+        result = bce_func(preds, targets, reduction="none")
+
+        if result.ndim == 1:
+            # When not predicting overlap, target confidence is the same for every target, but we should still return a
+            # matrix.
+            result = result.unsqueeze(0)
+            result = torch.broadcast_to(result, overlap.shape)
+
+        return result
+
+    def _calculate_bg_confidence(self, preds: Tensor, bce_func: Callable):
+        """Calculates the confidence loss for background anchors."""
+        targets = torch.zeros_like(preds)
+        return bce_func(preds, targets, reduction="none")
+
+    def _calculate_class(self, preds: Tensor, targets: Tensor, bce_func: Callable) -> Tensor:
+        """Calculates the classification losses.
+
+        If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. Then
+        calculates the classification losses between the predictions and the targets, and returns a matrix of losses
+        between all combinations of a target and a prediction.
+
+        Args:
+            preds: An ``[N, C]`` matrix of predicted class probabilities.
+            targets: An ``[M, C]`` matrix of target class probabilities or an ``[M]`` vector of class labels.
+            bce_func: A function for calculating binary cross entropy.
+
+        Returns:
+            An ``[M, N]`` matrix of losses between all combinations of a target and a prediction.
+        """
+        num_classes = preds.shape[-1]
+        if targets.ndim == 1:
+            # The data may contain a different number of classes than what the model predicts. In case a label is
+            # greater than the number of predicted classes, it will be mapped to the last class.
+            last_class = torch.tensor(num_classes - 1, device=targets.device)
+            targets = torch.min(targets, last_class)
+            targets = torch.nn.functional.one_hot(targets, num_classes)
+        elif targets.shape[-1] != num_classes:
+            raise ValueError(
+                f"The number of classes in the data ({targets.shape[-1]}) doesn't match the number of classes "
+                f"predicted by the model ({num_classes})."
+            )
+        targets = targets.to(dtype=preds.dtype)
+
+        preds = preds.unsqueeze(0)  # [1, preds, classes]
+        targets = targets.unsqueeze(1)  # [targets, 1, classes]
+        preds, targets = torch.broadcast_tensors(preds, targets)
+        return bce_func(preds, targets, reduction="none").sum(-1)
+
+    def __call__(self, preds, targets, input_is_normalized: bool, image_size: Optional[Tensor] = None):
+        """Calculates the losses for all pairs of a prediction and a target, and if "bg_confidences" appears in
+        ``preds``, calculates the confidence loss for background predictions.
+
+        This method is called before taking the final losses using ``sums()``, and for obtaining costs for SimOTA
+        matching.
+
+        Args:
+            preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs".
+            targets: A dictionary of training targets, containing "boxes" and "labels".
+            input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`.
+            image_size: Width and height in a vector that defines the scale of the target coordinates.
+ """ + bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits + + overlap, overlap_loss = self._calculate_overlap(preds["boxes"], targets["boxes"], image_size) + self.overlap = overlap + self.overlap_loss = overlap_loss * self.overlap_multiplier + + confidence_loss = self._calculate_confidence(preds["confidences"], overlap, bce_func) + self.confidence_loss = confidence_loss * self.confidence_multiplier + + if "bg_confidences" in preds: + bg_confidence_loss = self._calculate_bg_confidence(preds["bg_confidences"], bce_func) + self.bg_confidence_loss = bg_confidence_loss * self.confidence_multiplier + + class_loss = self._calculate_class(preds["classprobs"], targets["labels"], bce_func) + self.class_loss = class_loss * self.class_multiplier + + def sums(self): + """Returns the sums of the losses over prediction/target pairs, assuming the predictions and targets have + been matched (there are as many predictions and targets).""" + overlap_loss = self.overlap_loss.diagonal().sum() + confidence_loss = self.confidence_loss.diagonal().sum() + self.bg_confidence_loss.sum() + class_loss = self.class_loss.diagonal().sum() + return overlap_loss, confidence_loss, class_loss diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index ebb494f5ef..fac07e7f4f 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -1,101 +1,102 @@ -import logging -from typing import Any, Dict, List, Optional, Tuple, Type +import io +from copy import copy +from typing import Any, Callable, Dict, List, Optional, Tuple, Type import numpy as np import torch import torch.nn as nn from pytorch_lightning import LightningModule -from pytorch_lightning.utilities import rank_zero_info +from pytorch_lightning.utilities.cli import LightningCLI +from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info from torch import Tensor, optim +from pl_bolts.datamodules import VOCDetectionDataModule +from pl_bolts.datamodules.vocdetection_datamodule import Compose +from pl_bolts.models.detection.yolo.darknet_configuration import DarknetConfiguration from pl_bolts.models.detection.yolo.yolo_layers import DetectionLayer, RouteLayer, ShortcutLayer from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHVISION_AVAILABLE: - from torchvision.ops import nms + from torchvision.ops import batched_nms from torchvision.transforms import functional as F else: warn_missing_pkg("torchvision") -log = logging.getLogger(__name__) - class YOLO(LightningModule): - """PyTorch Lightning implementation of YOLOv3 and YOLOv4. + """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, + Scaled-YOLOv4, and YOLOX. *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `_ *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `_ + *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `_ + + *YOLOX paper*: `Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun `_ + *Implementation*: `Seppo Enarvi `_ The network architecture can be read from a Darknet configuration file using the - :class:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration` class, or created by - some other means, and provided as a list of PyTorch modules. 
+ :class:`~pl_bolts.models.detection.yolo.darknet_configuration.DarknetConfiguration` class, or created by some other + means, and provided as a list of PyTorch modules. - The input from the data loader is expected to be a list of images. Each image is a tensor with - shape ``[channels, height, width]``. The images from a single batch will be stacked into a - single tensor, so the sizes have to match. Different batches can have different image sizes, as - long as the size is divisible by the ratio in which the network downsamples the input. + The input from the data loader is expected to be a list of images. Each image is a tensor with shape + ``[channels, height, width]``. The images from a single batch will be stacked into a single tensor, so the sizes + have to match. Different batches can have different image sizes, as long as the size is divisible by the ratio in + which the network downsamples the input. - During training, the model expects both the input tensors and a list of targets. *Each target is - a dictionary containing*: + During training, the model expects both the input tensors and a list of targets. *Each target is a dictionary + containing the following tensors*: - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in `(x1, y1, x2, y2)` format - - labels (``Int64Tensor[N]``): the class label for each ground-truth box + - labels (``Int64Tensor[N]`` or ``BoolTensor[N, classes]``): the class label or a boolean class mask for each + ground-truth box - :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.forward` method returns all - predictions from all detection layers in all images in one tensor with shape - ``[images, predictors, classes + 5]``. The coordinates are scaled to the input image size. - During training it also returns a dictionary containing the classification, box overlap, and - confidence losses. + :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.forward` method returns all predictions from all detection + layers in one tensor with shape ``[images, predictors, classes + 5]``. The coordinates are scaled to the input image + size. During training it also returns a dictionary containing the classification, box overlap, and confidence + losses. - During inference, the model requires only the input tensors. - :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.infer` method filters and processes the - predictions. *The processed output includes the following tensors*: + During inference, the model requires only the image tensors. + :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.infer` method filters and processes the predictions. If a + prediction has a high score for more than one class, it will be duplicated. *The processed output is returned in a + dictionary containing the following tensors*: - boxes (``FloatTensor[N, 4]``): predicted bounding box `(x1, y1, x2, y2)` coordinates in image space - scores (``FloatTensor[N]``): detection confidences - - labels (``Int64Tensor[N]``): the predicted labels for each image + - labels (``Int64Tensor[N]``): the predicted labels for each object Weights can be loaded from a Darknet model file using ``load_darknet_weights()``. - CLI command:: - - # PascalVOC - wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg - python yolo_module.py --config yolov4-tiny-3l.cfg --data_dir . --gpus 8 --batch_size 8 + Args: + network: A list of network modules. 
This can be obtained from a Darknet configuration using the + :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method. + optimizer: Which optimizer class to use for training. + optimizer_params: Parameters to pass to the optimizer constructor. Weight decay will be applied only to + convolutional layer weights. + lr_scheduler: Which learning rate scheduler class to use for training. + lr_scheduler_params: Parameters to pass to the learning rate scheduler constructor. + confidence_threshold: Postprocessing will remove bounding boxes whose confidence score is not higher than this + threshold. + nms_threshold: Non-maximum suppression will remove bounding boxes whose IoU with a higher confidence box is + higher than this threshold, if the predicted categories are equal. + detections_per_image: Keep at most this number of highest-confidence detections per image. """ def __init__( self, network: nn.ModuleList, optimizer: Type[optim.Optimizer] = optim.SGD, - optimizer_params: Dict[str, Any] = {"lr": 0.001, "momentum": 0.9, "weight_decay": 0.0005}, + optimizer_params: Dict[str, Any] = {"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0005}, lr_scheduler: Type[optim.lr_scheduler._LRScheduler] = LinearWarmupCosineAnnealingLR, - lr_scheduler_params: Dict[str, Any] = {"warmup_epochs": 1, "max_epochs": 300, "warmup_start_lr": 0.0}, + lr_scheduler_params: Dict[str, Any] = {"warmup_epochs": 5, "max_epochs": 300, "warmup_start_lr": 0.0}, confidence_threshold: float = 0.2, nms_threshold: float = 0.45, - max_predictions_per_image: int = -1, + detections_per_image: int = 300, ) -> None: - """ - Args: - network: A list of network modules. This can be obtained from a Darknet configuration - using the :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` - method. - optimizer: Which optimizer class to use for training. - optimizer_params: Parameters to pass to the optimizer constructor. - lr_scheduler: Which learning rate scheduler class to use for training. - lr_scheduler_params: Parameters to pass to the learning rate scheduler constructor. - confidence_threshold: Postprocessing will remove bounding boxes whose - confidence score is not higher than this threshold. - nms_threshold: Non-maximum suppression will remove bounding boxes whose IoU with a higher - confidence box is higher than this threshold, if the predicted categories are equal. - max_predictions_per_image: If non-negative, keep at most this number of - highest-confidence predictions per image. - """ super().__init__() if not _TORCHVISION_AVAILABLE: @@ -108,34 +109,27 @@ def __init__( self.lr_scheduler_params = lr_scheduler_params self.confidence_threshold = confidence_threshold self.nms_threshold = nms_threshold - self.max_predictions_per_image = max_predictions_per_image + self.detections_per_image = detections_per_image - def forward( - self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None - ) -> Tuple[Tensor, Dict[str, Tensor]]: + def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets are provided, computes the losses from the detection layers. - Detections are concatenated from the detection layers. Each image will produce - `N * num_anchors * grid_height * grid_width` detections, where `N` depends on the number of - detection layers. 
For one detection layer `N = 1`, and each detection layer increases it by - a number that depends on the size of the feature map on that layer. For example, if the - feature map is twice as wide and high as the grid, the layer will add four times more - features. + Detections are concatenated from the detection layers. Each detection layer will produce a number of detections + that depends on the size of the feature map and the number of anchors per grid cell. Args: images: Images to be processed. Tensor of size - ``[batch_size, num_channels, height, width]``. + ``[batch_size, channels, height, width]``. targets: If set, computes losses from detection layers against these targets. A list of - dictionaries, one for each image. + target dictionaries, one for each image. Returns: - detections (:class:`~torch.Tensor`), losses (Dict[str, :class:`~torch.Tensor`]): - Detections, and if targets were provided, a dictionary of losses. Detections are shaped - ``[batch_size, num_predictors, num_classes + 5]``, where ``num_predictors`` is the - total number of cells in all detection layers times the number of boxes predicted by - one cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to - the input image size. + detections (:class:`~torch.Tensor`), losses (Dict[str, :class:`~torch.Tensor`]): Detections, and if targets + were provided, a dictionary of losses. Detections are shaped + ``[batch_size, predictors, classes + 5]``, where ``predictors`` is the total number of cells in all + detection layers times the number of boxes predicted by one cell. The predicted box coordinates are in + `(x1, y1, x2, y2)` format and scaled to the input image size. """ outputs = [] # Outputs from all layers detections = [] # Outputs from detection layers @@ -147,20 +141,20 @@ def forward( image_size = torch.tensor([image_width, image_height], device=images.device) x = images - for module in self.network: - if isinstance(module, (RouteLayer, ShortcutLayer)): - x = module(x, outputs) - elif isinstance(module, DetectionLayer): + for layer in self.network: + if isinstance(layer, (RouteLayer, ShortcutLayer)): + x = layer(x, outputs) + elif isinstance(layer, DetectionLayer): if targets is None: - x = module(x, image_size) + x = layer(x, image_size) detections.append(x) else: - x, layer_losses, layer_hits = module(x, image_size, targets) + x = layer(x, image_size, targets) detections.append(x) - losses.append(layer_losses) - hits.append(layer_hits) + losses.append(layer.losses) + hits.append(layer.hits) else: - x = module(x) + x = layer(x) outputs.append(x) @@ -169,27 +163,38 @@ def forward( return detections total_hits = sum(hits) - num_targets = sum(len(image_targets["boxes"]) for image_targets in targets) - if total_hits != num_targets: - log.warning( - f"{num_targets} training targets were matched a total of {total_hits} times by detection layers. " - "Anchors may have been configured incorrectly." 
- ) for layer_idx, layer_hits in enumerate(hits): hit_rate = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=False) - def total_loss(loss_name): - """Returns the sum of the loss over detection layers.""" - loss_tuple = tuple(layer_losses[loss_name] for layer_losses in losses) - return torch.stack(loss_tuple).sum() - - losses = {loss_name: total_loss(loss_name) for loss_name in losses[0].keys()} + losses = torch.stack(losses).sum(0) return detections, losses def configure_optimizers(self) -> Tuple[List, List]: - """Constructs the optimizer and learning rate scheduler.""" - optimizer = self.optimizer_class(self.parameters(), **self.optimizer_params) + """Constructs the optimizer and learning rate scheduler based on ``self.optimizer_params`` and + ``self.lr_scheduler_params``. + + If weight decay is specified, it will be applied only to convolutional layer weights. + """ + if ("weight_decay" in self.optimizer_params) and (self.optimizer_params["weight_decay"] != 0): + defaults = copy(self.optimizer_params) + weight_decay = defaults.pop("weight_decay") + + default_group = [] + wd_group = [] + for name, tensor in self.named_parameters(): + if name.endswith(".conv.weight"): + wd_group.append(tensor) + else: + default_group.append(tensor) + + params = [ + {"params": default_group, "weight_decay": 0.0}, + {"params": wd_group, "weight_decay": weight_decay}, + ] + optimizer = self.optimizer_class(params, **defaults) + else: + optimizer = self.optimizer_class(self.parameters(), **self.optimizer_params) lr_scheduler = self.lr_scheduler_class(optimizer, **self.lr_scheduler_params) return [optimizer], [lr_scheduler] @@ -197,8 +202,8 @@ def training_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], bat """Computes the training loss. Args: - batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. - Targets is a list of dictionaries that contain ground-truth boxes, labels, etc. + batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target + dictionaries. batch_idx: The index of this batch. Returns: @@ -206,89 +211,85 @@ def training_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], bat """ images, targets = self._validate_batch(batch) _, losses = self(images, targets) - total_loss = torch.stack(tuple(losses.values())).sum() # sync_dist=True is broken in some versions of Lightning and may cause the sum of the loss # across GPUs to be returned. - for name, value in losses.items(): - self.log(f"train/{name}_loss", value, prog_bar=True, sync_dist=False) - self.log("train/total_loss", total_loss, sync_dist=False) + self.log("train/overlap_loss", losses[0], prog_bar=True, sync_dist=False) + self.log("train/confidence_loss", losses[1], prog_bar=True, sync_dist=False) + self.log("train/class_loss", losses[2], prog_bar=True, sync_dist=False) + self.log("train/total_loss", losses.sum(), sync_dist=False) - return {"loss": total_loss} + return {"loss": losses.sum()} def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): """Evaluates a batch of data from the validation set. Args: - batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. - Targets is a list of dictionaries that contain ground-truth boxes, labels, etc. + batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target + dictionaries. 
batch_idx: The index of this batch """ images, targets = self._validate_batch(batch) - detections, losses = self(images, targets) - detections = self._split_detections(detections) - detections = self._filter_detections(detections) - total_loss = torch.stack(tuple(losses.values())).sum() + _, losses = self(images, targets) - for name, value in losses.items(): - self.log(f"val/{name}_loss", value, sync_dist=True) - self.log("val/total_loss", total_loss, sync_dist=True) + self.log("val/overlap_loss", losses[0], sync_dist=True) + self.log("val/confidence_loss", losses[1], sync_dist=True) + self.log("val/class_loss", losses[2], sync_dist=True) + self.log("val/total_loss", losses.sum(), sync_dist=True) def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): """Evaluates a batch of data from the test set. Args: - batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. - Targets is a list of dictionaries that contain ground-truth boxes, labels, etc. + batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target + dictionaries. batch_idx: The index of this batch. """ images, targets = self._validate_batch(batch) - detections, losses = self(images, targets) - detections = self._split_detections(detections) - detections = self._filter_detections(detections) - total_loss = torch.stack(tuple(losses.values())).sum() + _, losses = self(images, targets) - for name, value in losses.items(): - self.log(f"test/{name}_loss", value, sync_dist=True) - self.log("test/total_loss", total_loss, sync_dist=True) + self.log("test/overlap_loss", losses[0], sync_dist=True) + self.log("test/confidence_loss", losses[1], sync_dist=True) + self.log("test/class_loss", losses[2], sync_dist=True) + self.log("test/total_loss", losses.sum(), sync_dist=True) - def infer(self, image: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + def infer(self, image: Tensor) -> Dict[str, Tensor]: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class labels. + If a prediction has a high score for more than one class, it will be duplicated. + Args: image: An input image, a tensor of uint8 values sized ``[channels, height, width]``. Returns: - boxes (:class:`~torch.Tensor`), confidences (:class:`~torch.Tensor`), labels (:class:`~torch.Tensor`): - A matrix of detected bounding box `(x1, y1, x2, y2)` coordinates, a vector of - confidences for the bounding box detections, and a vector of predicted class labels. + A dictionary containing tensors "boxes", "scores", and "labels". "boxes" is a matrix of detected bounding + box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box + detections. "labels" is a vector of predicted class labels. """ if not isinstance(image, torch.Tensor): image = F.to_tensor(image) self.eval() detections = self(image.unsqueeze(0)) - detections = self._split_detections(detections) - detections = self._filter_detections(detections) - boxes = detections["boxes"][0] - scores = detections["scores"][0] - labels = detections["labels"][0] - return boxes, scores, labels + detections = self.process_detections(detections) + return detections[0] def load_darknet_weights(self, weight_file): """Loads weights to layer modules from a pretrained Darknet model. - One may want to continue training from the pretrained weights, on a dataset with a - different number of object categories. 
The number of kernels in the convolutional layers - just before each detection layer depends on the number of output classes. The Darknet - solution is to truncate the weight file and stop reading weights at the first incompatible - layer. For this reason the function silently leaves the rest of the layers unchanged, when + One may want to continue training from pretrained weights, on a dataset with a different number of object + categories. The number of kernels in the convolutional layers just before each detection layer depends on the + number of output classes. The Darknet solution is to truncate the weight file and stop reading weights at the + first incompatible layer. For this reason the function silently leaves the rest of the layers unchanged, when the weight file ends. Args: weight_file: A file object containing model weights in the Darknet binary format. """ + if not isinstance(weight_file, io.IOBase): + raise ValueError("weight_file must be a file-like object.") + version = np.fromfile(weight_file, count=3, dtype=np.int32) images_seen = np.fromfile(weight_file, count=1, dtype=np.int64) rank_zero_info( @@ -302,24 +303,24 @@ def read(tensor): If there's no more data in ``weight_file``, returns without error. """ x = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) - if x.shape[0] == 0: - return - x = torch.from_numpy(x).view_as(tensor) - with torch.no_grad(): - tensor.copy_(x) + if x.size > 0: + x = torch.from_numpy(x).view_as(tensor) + with torch.no_grad(): + tensor.copy_(x) + return x.size - for module in self.network: + for layer_idx, layer in enumerate(self.network): # Weights are loaded only to convolutional layers - if not isinstance(module, nn.Sequential): + if not (isinstance(layer, nn.Sequential) and isinstance(layer[0], nn.Conv2d)): continue - conv = module[0] - assert isinstance(conv, nn.Conv2d) + conv = layer[0] + rank_zero_debug(f"Reading weights for layer {layer_idx}: {list(conv.weight.shape)}") # Convolution may be followed by batch normalization, in which case we read the batch # normalization parameters and not the convolution bias. - if len(module) > 1 and isinstance(module[1], nn.BatchNorm2d): - bn = module[1] + if len(layer) > 1 and isinstance(layer[1], nn.BatchNorm2d): + bn = layer[1] read(bn.bias) read(bn.weight) read(bn.running_mean) @@ -327,7 +328,76 @@ def read(tensor): else: read(conv.bias) - read(conv.weight) + read_count = read(conv.weight) + if read_count == 0: + return + + def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: + """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and + filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of + predictions. + + If for any single detection there are multiple categories whose score is above the confidence threshold, the + detection will be duplicated to create one detection for each category. NMS processes one category at a time, + iterating over the bounding boxes in descending order of confidence score, and removes lower scoring boxes that + have an IoU greater than the NMS threshold with a higher scoring box. + + The returned detections are sorted by descending confidence. 
The items of the dictionaries are as follows: + + - boxes (``Tensor[batch_size, N, 4]``): detected bounding box `(x1, y1, x2, y2)` coordinates + - scores (``Tensor[batch_size, N]``): detection confidences + - labels (``Int64Tensor[batch_size, N]``): the predicted class IDs + + Args: + preds: A tensor of detected bounding boxes and their attributes. + + Returns: + Filtered detections. A list of prediction dictionaries, one for each image. + """ + result = [] + + for image_preds in preds: + boxes = image_preds[..., :4] + confidences = image_preds[..., 4] + classprobs = image_preds[..., 5:] + scores = classprobs * confidences[:, None] + + # Select predictions with high scores. If a prediction has a high score for more than one class, it will be + # duplicated. + idxs, labels = (scores > self.confidence_threshold).nonzero().T + boxes = boxes[idxs] + scores = scores[idxs, labels] + + keep = batched_nms(boxes, scores, labels, self.nms_threshold) + keep = keep[: self.detections_per_image] + boxes = boxes[keep] + scores = scores[keep] + labels = labels[keep] + result.append({"boxes": boxes, "scores": scores, "labels": labels}) + + return result + + def process_targets(self, targets: List[Dict[str, Tensor]]) -> List[Dict[str, Tensor]]: + """Duplicates multi-label targets to create one target for each label. + + Args: + targets: List of target dictionaries. Each dictionary must contain "boxes" and "labels". "labels" is either + a one-dimensional list of class IDs, or a two-dimensional boolean class map. + + Returns: + Single-label targets. A list of target dictionaries, one for each image. + """ + result = [] + + for image_targets in targets: + boxes = image_targets["boxes"] + labels = image_targets["labels"] + if labels.ndim == 2: + idxs, labels = labels.nonzero().T + boxes = boxes[idxs] + result.append({"boxes": boxes, "labels": labels}) + + return result def _validate_batch( self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]] @@ -353,257 +423,127 @@ def _validate_batch( boxes = target["boxes"] if not isinstance(boxes, Tensor): raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes)}.") - if (len(boxes.shape) != 2) or (boxes.shape[-1] != 4): + if (boxes.ndim != 2) or (boxes.shape[-1] != 4): raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") labels = target["labels"] if not isinstance(labels, Tensor): raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels)}.") - if len(labels.shape) != 1: - raise ValueError(f"Expected target labels to be tensors of shape [N], got {list(labels.shape)}.") + if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): + raise ValueError( + f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." + ) images = torch.stack(images) return images, targets - def _split_detections(self, detections: Tensor) -> Dict[str, Tensor]: - """Splits the detection tensor returned by a forward pass into a dictionary. - The fields of the dictionary are as follows: - - boxes (``Tensor[batch_size, N, 4]``): detected bounding box `(x1, y1, x2, y2)` coordinates - - scores (``Tensor[batch_size, N]``): detection confidences - - classprobs (``Tensor[batch_size, N]``): probabilities of the best classes - - labels (``Int64Tensor[batch_size, N]``): the predicted labels for each image +class DarknetYOLO(YOLO): + """A subclass of YOLO that uses a Darknet configuration file and can be configured using LightningCLI. 
- Args: - detections: A tensor of detected bounding boxes and their attributes. + At most one matching algorithm, ``match_sim_ota``, ``match_size_ratio``, or ``match_iou_threshold`` can be + specified. If none of them is given, the default algorithm is used, which matche a target to the prior shape + (anchor) that gives the highest IoU. - Returns: - A dictionary of detection results. - """ - boxes = detections[..., :4] - scores = detections[..., 4] - classprobs = detections[..., 5:] - classprobs, labels = torch.max(classprobs, -1) - return {"boxes": boxes, "scores": scores, "classprobs": classprobs, "labels": labels} - - def _filter_detections(self, detections: Dict[str, Tensor]) -> Dict[str, List[Tensor]]: - """Filters detections based on confidence threshold. Then for every class performs non-maximum suppression - (NMS). NMS iterates the bounding boxes that predict this class in descending order of confidence score, and - removes lower scoring boxes that have an IoU greater than the NMS threshold with a higher scoring box. - Finally the detections are sorted by descending confidence and possible truncated to the maximum number of - predictions. + CLI command:: - Args: - detections: All detections. A dictionary of tensors, each containing the predictions - from all images. + # PascalVOC using LightningCLI + wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg + python yolo_module.py fit --model.network_config yolov4-tiny-3l.cfg --data.batch_size 8 --trainer.gpus 8 \ + --trainer.accumulate_grad_batches 2 - Returns: - Filtered detections. A dictionary of lists, each containing a tensor per image. - """ - boxes = detections["boxes"] - scores = detections["scores"] - classprobs = detections["classprobs"] - labels = detections["labels"] - - out_boxes = [] - out_scores = [] - out_classprobs = [] - out_labels = [] - - for img_boxes, img_scores, img_classprobs, img_labels in zip(boxes, scores, classprobs, labels): - # Select detections with high confidence score. - selected = img_scores > self.confidence_threshold - img_boxes = img_boxes[selected] - img_scores = img_scores[selected] - img_classprobs = img_classprobs[selected] - img_labels = img_labels[selected] - - img_out_boxes = boxes.new_zeros((0, 4)) - img_out_scores = scores.new_zeros(0) - img_out_classprobs = classprobs.new_zeros(0) - img_out_labels = labels.new_zeros(0) - - # Iterate through the unique object classes detected in the image and perform non-maximum - # suppression for the objects of the class in question. - for cls_label in labels.unique(): - selected = img_labels == cls_label - cls_boxes = img_boxes[selected] - cls_scores = img_scores[selected] - cls_classprobs = img_classprobs[selected] - cls_labels = img_labels[selected] - - # NMS will crash if there are too many boxes. - cls_boxes = cls_boxes[:100000] - cls_scores = cls_scores[:100000] - selected = nms(cls_boxes, cls_scores, self.nms_threshold) - - img_out_boxes = torch.cat((img_out_boxes, cls_boxes[selected])) - img_out_scores = torch.cat((img_out_scores, cls_scores[selected])) - img_out_classprobs = torch.cat((img_out_classprobs, cls_classprobs[selected])) - img_out_labels = torch.cat((img_out_labels, cls_labels[selected])) - - # Sort by descending confidence and limit the maximum number of predictions. 
- indices = torch.argsort(img_out_scores, descending=True) - if self.max_predictions_per_image >= 0: - indices = indices[: self.max_predictions_per_image] - out_boxes.append(img_out_boxes[indices]) - out_scores.append(img_out_scores[indices]) - out_classprobs.append(img_out_classprobs[indices]) - out_labels.append(img_out_labels[indices]) - - return {"boxes": out_boxes, "scores": out_scores, "classprobs": out_classprobs, "labels": out_labels} - - -class Resize: - """Rescales the image and target to given dimensions. + Args: + network_config: Path to a Darknet configuration file that defines the network architecture. + match_sim_ota: If ``True``, matches a target to an anchor using the SimOTA algorithm from YOLOX. + match_size_ratio: If specified, matches a target to an anchor if its width and height relative to the anchor is + smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects for + each target the anchor with the highest IoU. + match_iou_threshold: If specified, matches a target to an anchor if the IoU is higher than this threshold. + ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_loss: A function that will return the overlap loss given predicted and target boxes. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + """ + + def __init__( + self, + network_config: str, + match_sim_ota: bool = False, + match_size_ratio: Optional[float] = None, + match_iou_threshold: Optional[float] = None, + ignore_iou_threshold: Optional[float] = None, + overlap_loss: Optional[str] = None, + predict_overlap: Optional[float] = None, + overlap_loss_multiplier: Optional[float] = None, + class_loss_multiplier: Optional[float] = None, + confidence_loss_multiplier: Optional[float] = None, + **kwargs, + ) -> None: + network = DarknetConfiguration(network_config).get_network( + match_sim_ota=match_sim_ota, + match_size_ratio=match_size_ratio, + match_iou_threshold=match_iou_threshold, + ignore_iou_threshold=ignore_iou_threshold, + overlap_loss=overlap_loss, + predict_overlap=predict_overlap, + overlap_loss_multiplier=overlap_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, + confidence_loss_multiplier=confidence_loss_multiplier, + ) + super().__init__(**kwargs, network=network) + + +class ResizedVOCDetectionDataModule(VOCDetectionDataModule): + """A subclass of VOCDetectionDataModule that resizes the images to a specific size. YOLO expectes the image size to + be divisible by the ratio in which the network downsamples the image. Args: - output_size (tuple or int): Desired output size. If tuple (height, width), the output is - matched to ``output_size``. If int, the smaller of the image edges is matched to - ``output_size``, keeping the aspect ratio the same. + width: Resize images to this width. + height: Resize images to this height. 
""" - def __init__(self, output_size: tuple) -> None: - self.output_size = output_size + def __init__(self, width: int = 608, height: int = 608, **kwargs): + super().__init__(**kwargs) + self.image_size = (height, width) + + def default_transforms(self) -> Callable: + transforms = [ + lambda image, target: (F.to_tensor(image), target), + self._resize, + ] + if self.normalize: + transforms += [ + lambda image, target: ( + F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + target, + ) + ] + return Compose(transforms) + + def _resize(self, image: Tensor, target: Dict[str, Any]): + """Rescales the image and target to ``self.image_size``. - def __call__(self, image: Tensor, target: Dict[str, Any]): - """ Args: - tensor: Tensor image to be resized. + tensor: Image tensor to be resized. target: Dictionary of detection targets. Returns: - Resized Tensor image. + Resized image tensor. """ + device = target["boxes"].device height, width = image.shape[-2:] - original_size = torch.tensor([height, width]) - scale_y, scale_x = torch.tensor(self.output_size) / original_size - scale = torch.tensor([scale_x, scale_y, scale_x, scale_y], device=target["boxes"].device) - image = F.resize(image, self.output_size) + original_size = torch.tensor([height, width], device=device) + scale_y, scale_x = torch.tensor(self.image_size, device=device) / original_size + scale = torch.tensor([scale_x, scale_y, scale_x, scale_y], device=device) + image = F.resize(image, self.image_size) target["boxes"] = target["boxes"] * scale return image, target -def run_cli(): - from argparse import ArgumentParser - - from pytorch_lightning import Trainer, seed_everything - - from pl_bolts.datamodules import VOCDetectionDataModule - from pl_bolts.datamodules.vocdetection_datamodule import Compose - from pl_bolts.models.detection.yolo.yolo_config import YOLOConfiguration - - seed_everything(42) - - parser = ArgumentParser() - parser.add_argument( - "--config", - type=str, - metavar="PATH", - required=True, - help="read model configuration from PATH", - ) - parser.add_argument( - "--darknet-weights", - type=str, - metavar="PATH", - help="read the initial model weights from PATH in Darknet format", - ) - parser.add_argument( - "--lr", - type=float, - metavar="LR", - default=0.0013, - help="learning rate after the warmup period", - ) - parser.add_argument( - "--momentum", - type=float, - metavar="GAMMA", - default=0.9, - help="if nonzero, the optimizer uses momentum with factor GAMMA", - ) - parser.add_argument( - "--weight-decay", - type=float, - metavar="LAMBDA", - default=0.0005, - help="if nonzero, the optimizer uses weight decay (L2 penalty) with factor LAMBDA", - ) - parser.add_argument( - "--warmup-epochs", - type=int, - metavar="N", - default=1, - help="learning rate warmup period is N epochs", - ) - parser.add_argument( - "--max-epochs", - type=int, - metavar="N", - default=300, - help="train at most N epochs", - ) - parser.add_argument( - "--initial-lr", - type=float, - metavar="LR", - default=0.0, - help="learning rate before the warmup period", - ) - parser.add_argument( - "--confidence-threshold", - type=float, - metavar="THRESHOLD", - default=0.001, - help="keep predictions only if the confidence is above THRESHOLD", - ) - parser.add_argument( - "--nms-threshold", - type=float, - metavar="THRESHOLD", - default=0.45, - help="non-maximum suppression removes predicted boxes that have IoU greater than " - "THRESHOLD with a higher scoring box", - ) - parser.add_argument( - "--max-predictions-per-image", - 
type=int, - metavar="N", - default=100, - help="keep at most N best predictions", - ) - - parser = VOCDetectionDataModule.add_argparse_args(parser) - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - config = YOLOConfiguration(args.config) - - transforms = [lambda image, target: (F.to_tensor(image), target), Resize((config.height, config.width))] - transforms = Compose(transforms) - datamodule = VOCDetectionDataModule.from_argparse_args(args, train_transforms=transforms, val_transforms=transforms) - - optimizer_params = {"lr": args.lr, "momentum": args.momentum, "weight_decay": args.weight_decay} - lr_scheduler_params = { - "warmup_epochs": args.warmup_epochs, - "max_epochs": args.max_epochs, - "warmup_start_lr": args.initial_lr, - } - model = YOLO( - network=config.get_network(), - optimizer_params=optimizer_params, - lr_scheduler_params=lr_scheduler_params, - confidence_threshold=args.confidence_threshold, - nms_threshold=args.nms_threshold, - max_predictions_per_image=args.max_predictions_per_image, - ) - if args.darknet_weights is not None: - with open(args.darknet_weights) as weight_file: - model.load_darknet_weights(weight_file) - - trainer = Trainer.from_argparse_args(args) - trainer.fit(model, datamodule=datamodule) - - if __name__ == "__main__": - run_cli() + LightningCLI(DarknetYOLO, ResizedVOCDetectionDataModule, seed_everything_default=42) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index fcb14eda9b..c15fa25f20 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -6,9 +6,17 @@ from torch.utils.data import DataLoader from pl_bolts.datasets import DummyDetectionDataset -from pl_bolts.models.detection import YOLO, FasterRCNN, RetinaNet, YOLOConfiguration +from pl_bolts.models.detection import YOLO, DarknetConfiguration, FasterRCNN, RetinaNet from pl_bolts.models.detection.faster_rcnn import create_fasterrcnn_backbone -from pl_bolts.models.detection.yolo.yolo_layers import _aligned_iou +from pl_bolts.models.detection.yolo.target_matching import _sim_ota_match +from pl_bolts.models.detection.yolo.utils import ( + aligned_iou, + global_xy, + grid_centers, + grid_offsets, + iou_below, + is_inside_box, +) from tests import TEST_ROOT @@ -83,9 +91,122 @@ def test_fasterrcnn_pyt_module_bbone_train(tmpdir): trainer.fit(model, train_dl, valid_dl) +@pytest.mark.parametrize("width,height", [(10, 5)]) +def test_grid_offsets(width: int, height: int): + size = torch.tensor([width, height]) + offsets = grid_offsets(size) + assert offsets.shape == (height, width, 2) + assert torch.equal(offsets[0, :, 0], torch.arange(width, dtype=offsets.dtype)) + assert torch.equal(offsets[0, :, 1], torch.zeros(width, dtype=offsets.dtype)) + assert torch.equal(offsets[:, 0, 0], torch.zeros(height, dtype=offsets.dtype)) + assert torch.equal(offsets[:, 0, 1], torch.arange(height, dtype=offsets.dtype)) + + +@pytest.mark.parametrize("width,height", [(10, 5)]) +def test_grid_centers(width: int, height: int): + size = torch.tensor([width, height]) + centers = grid_centers(size) + assert centers.shape == (height, width, 2) + assert torch.equal(centers[0, :, 0], 0.5 + torch.arange(width, dtype=torch.float)) + assert torch.equal(centers[0, :, 1], 0.5 * torch.ones(width)) + assert torch.equal(centers[:, 0, 0], 0.5 * torch.ones(height)) + assert torch.equal(centers[:, 0, 1], 0.5 + torch.arange(height, dtype=torch.float)) + + +def test_global_xy(): + xy = torch.ones((2, 4, 4, 3, 2)) * 0.5 # 4x4 grid of coordinates to the center of the 
cell. + image_size = torch.tensor([400, 200]) + xy = global_xy(xy, image_size) + assert xy.shape == (2, 4, 4, 3, 2) + assert torch.all(xy[:, :, 0, :, 0] == 50) + assert torch.all(xy[:, 0, :, :, 1] == 25) + assert torch.all(xy[:, :, 1, :, 0] == 150) + assert torch.all(xy[:, 1, :, :, 1] == 75) + assert torch.all(xy[:, :, 2, :, 0] == 250) + assert torch.all(xy[:, 2, :, :, 1] == 125) + assert torch.all(xy[:, :, 3, :, 0] == 350) + assert torch.all(xy[:, 3, :, :, 1] == 175) + + +def test_is_inside_box(): + """ + centers: + [[1,1; 3,1; 5,1; 7,1; 9,1; 11,1; 13,1; 15,1; 17,1; 19,1] + [1,3; 3,3; 5,3; 7,3; 9,3; 11,3; 13,3; 15,3; 17,3; 19,3] + [1,5; 3,5; 5,5; 7,5; 9,5; 11,5; 13,5; 15,5; 17,5; 19,5] + [1,7; 3,7; 5,7; 7,7; 9,7; 11,7; 13,7; 15,7; 17,7; 19,7] + [1,9; 3,9; 5,9; 7,9; 9,9; 11,9; 13,9; 15,9; 17,9; 19,9]] + + is_inside[0]: + [[F, F, F, F, F, F, F, F, F, F] + [F, T, T, F, F, F, F, F, F, F] + [F, T, T, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F]] + + is_inside[1]: + [[F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, T, T, F]] + """ + size = torch.tensor([10, 5]) + centers = grid_centers(size) * 2.0 + centers = centers.view(-1, 2) + boxes = torch.tensor([[2, 2, 6, 6], [14, 8, 18, 10]]) + is_inside = is_inside_box(centers, boxes).view(2, 5, 10) + assert torch.count_nonzero(is_inside) == 6 + assert torch.all(is_inside[0, 1:3, 1:3]) + assert torch.all(is_inside[1, 4, 7:9]) + + +def test_sim_ota_match(): + # IoUs will determined that 2 and 1 predictions will be selected for the first and the second target. + ious = torch.tensor([[0.1, 0.1, 0.9, 0.9], [0.2, 0.3, 0.4, 0.1]]) + # Costs will determine that the first and the last prediction will be selected for the first target, and the first + # prediction will be selected for the second target. Since the first prediction was selected for both targets, it + # will be matched to the best target only (the second one). + costs = torch.tensor([[0.3, 0.5, 0.4, 0.3], [0.1, 0.2, 0.5, 0.3]]) + matched_preds, matched_targets = _sim_ota_match(costs, ious) + assert len(matched_preds) == 4 + assert matched_preds[0] + assert not matched_preds[1] + assert not matched_preds[2] + assert matched_preds[3] + assert len(matched_targets) == 2 # Two predictions were matched. + assert matched_targets[0] == 1 # Which target was matched to the first prediction. + assert matched_targets[1] == 0 # Which target was matched to the last prediction. 
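The following standalone sketch (not part of the patch) reproduces the dynamic-k behaviour that test_sim_ota_match
above asserts. The helper name and the "sum of the top-10 IoUs" heuristic are assumptions taken from the YOLOX paper,
so this is an illustration of the idea rather than the repository's _sim_ota_match:

    from typing import Tuple

    import torch
    from torch import Tensor

    def sim_ota_match_sketch(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]:
        """Illustrative dynamic-k matching. ``costs`` and ``ious`` are [targets, preds] matrices."""
        num_targets, num_preds = costs.shape
        matching_matrix = torch.zeros_like(costs, dtype=torch.bool)

        # Dynamic k: estimate how many predictions each target should get from the sum of its top-10 IoUs.
        top_ious, _ = ious.topk(min(10, num_preds), dim=-1)
        ks = torch.clamp(top_ious.sum(-1).int(), min=1)

        # For every target, select the k predictions with the lowest cost.
        for target_idx, (cost_row, k) in enumerate(zip(costs, ks)):
            _, pred_idxs = cost_row.topk(int(k), largest=False)
            matching_matrix[target_idx, pred_idxs] = True

        # If a prediction was selected for more than one target, keep only the lowest-cost target.
        more_than_one = matching_matrix.sum(0) > 1
        if more_than_one.any():
            best_target = costs[:, more_than_one].argmin(0)
            matching_matrix[:, more_than_one] = False
            matching_matrix[best_target, more_than_one] = True

        matched_preds = matching_matrix.any(0)  # Boolean mask over the predictions.
        matched_targets = matching_matrix[:, matched_preds].long().argmax(0)  # Target index per matched prediction.
        return matched_preds, matched_targets

With the tensors from the test above, the sketch selects the first and the last prediction and matches them to the
second and the first target respectively, i.e. it returns the mask [True, False, False, True] and the indices [1, 0].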
+ + +@pytest.mark.parametrize( + "dims1, dims2, expected_ious", + [ + ( + torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), + torch.tensor([[1.0, 10.0], [2.0, 20.0]]), + torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), + ) + ], +) +def test_aligned_iou(dims1, dims2, expected_ious): + torch.testing.assert_allclose(aligned_iou(dims1, dims2), expected_ious) + + +def test_iou_below(): + tl = torch.rand((10, 10, 3, 2)) * 100 + br = tl + 10 + pred_boxes = torch.cat((tl, br), -1) + target_boxes = torch.stack((pred_boxes[1, 1, 0], pred_boxes[3, 5, 1])) + result = iou_below(pred_boxes, target_boxes, 0.9) + assert result.shape == (10, 10, 3) + assert not result[1, 1, 0] + assert not result[3, 5, 1] + + def test_yolo(tmpdir): config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" - config = YOLOConfiguration(config_path) + config = DarknetConfiguration(config_path) model = YOLO(config.get_network()) image = torch.rand(1, 3, 256, 256) @@ -94,7 +215,7 @@ def test_yolo(tmpdir): def test_yolo_train(tmpdir): config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" - config = YOLOConfiguration(config_path) + config = DarknetConfiguration(config_path) model = YOLO(config.get_network()) train_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) @@ -102,17 +223,3 @@ def test_yolo_train(tmpdir): trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) trainer.fit(model, train_dataloader=train_dl, val_dataloaders=valid_dl) - - -@pytest.mark.parametrize( - "dims1, dims2, expected_ious", - [ - ( - torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), - torch.tensor([[1.0, 10.0], [2.0, 20.0]]), - torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), - ) - ], -) -def test_aligned_iou(dims1, dims2, expected_ious): - torch.testing.assert_allclose(_aligned_iou(dims1, dims2), expected_ious) From 476adb97070c68b71e8361c8a0d668eb8f1a3603 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Tue, 25 Jan 2022 19:37:37 +0200 Subject: [PATCH 02/76] Fixed ONNX export --- .../detection/yolo/darknet_configuration.py | 25 +++---- .../models/detection/yolo/target_matching.py | 4 +- pl_bolts/models/detection/yolo/utils.py | 37 ++++++----- pl_bolts/models/detection/yolo/yolo_layers.py | 66 +++++++++---------- pl_bolts/models/detection/yolo/yolo_loss.py | 4 +- pl_bolts/models/detection/yolo/yolo_module.py | 26 ++++++-- 6 files changed, 91 insertions(+), 71 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_configuration.py b/pl_bolts/models/detection/yolo/darknet_configuration.py index 95064034b8..e2f1474d2a 100644 --- a/pl_bolts/models/detection/yolo/darknet_configuration.py +++ b/pl_bolts/models/detection/yolo/darknet_configuration.py @@ -18,13 +18,13 @@ class DarknetConfiguration: """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. - The :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method - returns a PyTorch module list that can be used to construct a YOLO model. + The :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method returns a PyTorch + module list that can be used to construct a YOLO model. """ def __init__(self, path: str) -> None: - """Saves the variables from the first configuration section to attributes of this object, and the rest of - the sections to the ``layer_configs`` list. 
+ """Saves the variables from the first configuration section to attributes of this object, and the rest of the + sections to the ``layer_configs`` list. Args: path: Path to a configuration file @@ -40,8 +40,8 @@ def __init__(self, path: str) -> None: self.layer_configs = sections[1:] def get_network(self, **kwargs) -> nn.ModuleList: - """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns - the network structure that can be used to create a YOLO model. + """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns the + network structure that can be used to create a YOLO model. Returns: A :class:`~torch.nn.ModuleList` that defines the YOLO network. @@ -161,8 +161,8 @@ def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tu num_inputs: Number of channels in the input of every layer up to this layer. Returns: - module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the - number of channels in its output. + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. """ create_func = { "convolutional": _create_convolutional, @@ -222,8 +222,8 @@ def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): if kernel_size % 2 == 1: return maxpool, num_inputs[-1] - # If the kernel size is an even number, we need one cell of extra padding, on top of the padding - # added by MaxPool2d on both sides. + # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d + # on both sides. layer = nn.Sequential() layer.add_module("pad", nn.ZeroPad2d((0, 1, 0, 1))) layer.add_module("maxpool", maxpool) @@ -293,7 +293,10 @@ def _create_yolo( if sum(var is not None for var in (match_sim_ota, match_size_ratio, match_iou_threshold)) > 1: raise ValueError("More than one matching algorithm specified.") if match_sim_ota: - matching_func = SimOTAMatching(loss_func) + sim_ota_loss_func = LossFunction( + overlap_loss, None, overlap_loss_multiplier, class_loss_multiplier, confidence_loss_multiplier + ) + matching_func = SimOTAMatching(sim_ota_loss_func) elif match_size_ratio is not None: matching_func = SizeRatioMatching(match_size_ratio, anchor_dims, anchor_ids, ignore_iou_threshold) elif match_iou_threshold is not None: diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 0181ff4a97..2c7fe10daf 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -49,6 +49,7 @@ def __call__( image_size: Tensor, ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """For each target, selects predictions from the same grid cell, where the center of the target box is. + Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the predictions within the grid cell. @@ -162,8 +163,7 @@ def match(self, wh): class SizeRatioMatching(ShapeMatching): - """For each target, select those prior shapes, whose width and height relative to the target is below given - ratio. + """For each target, select those prior shapes, whose width and height relative to the target is below given ratio. This is the matching rule used by Ultralytics YOLOv5 implementation. 
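As a rough illustration of the size-ratio rule described above (not part of the patch), the sketch below matches a
target to an anchor when neither the width nor the height differs by more than a given factor in either direction.
The function name, the tensor layout, and the threshold of 4 are assumptions and not the actual SizeRatioMatching
implementation:

    import torch
    from torch import Tensor

    def size_ratio_matches(target_wh: Tensor, anchor_wh: Tensor, threshold: float = 4.0) -> Tensor:
        """Returns a [targets, anchors] boolean matrix of which targets and anchors have similar dimensions."""
        ratio = target_wh[:, None, :] / anchor_wh[None, :, :]  # [targets, anchors, 2]
        worst = torch.max(ratio, 1.0 / ratio).max(dim=-1).values  # Largest width or height ratio in either direction.
        return worst < threshold

    # Example: a 50x60 target is within a factor of four of both a 32x32 and a 116x90 anchor.
    targets = torch.tensor([[50.0, 60.0]])
    anchors = torch.tensor([[32.0, 32.0], [116.0, 90.0]])
    print(size_ratio_matches(targets, anchors))  # tensor([[True, True]])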
diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index f23869c974..6000371f02 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -19,7 +19,7 @@ def grid_offsets(grid_size: Tensor) -> Tensor: The width and height of the grid in a tensor. Returns: - A ``[height, width, 2]`` tensor containing the grid cell (x, y) offsets. + A ``[height, width, 2]`` tensor containing the grid cell `(x, y)` offsets. """ x_range = torch.arange(grid_size[0], device=grid_size.device) y_range = torch.arange(grid_size[1], device=grid_size.device) @@ -36,41 +36,46 @@ def grid_centers(grid_size: Tensor) -> Tensor: return grid_offsets(grid_size) + 0.5 +@torch.jit.script def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: """Adds offsets to the predicted box center coordinates to obtain global coordinates to the image. - The predicted coordinates are interpreted as coordinates inside a grid cell whose width and - height is 1. Adding offset to the cell, dividing by the grid size, and multiplying by the - image size, we get global coordinates in the image scale. + The predicted coordinates are interpreted as coordinates inside a grid cell whose width and height is 1. Adding + offset to the cell, dividing by the grid size, and multiplying by the image size, we get global coordinates in the + image scale. + + The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing based + generator will loose track of e.g. ``xy.shape[1]`` and treat it as a Python variable and not a tensor. This will + cause the dimension to be treated as a constant in the model, which prevents dynamic input sizes. Args: - xy: The predicted center coordinates before scaling. Values from zero to one in a - tensor sized ``[batch_size, height, width, boxes_per_cell, 2]``. + xy: The predicted center coordinates before scaling. Values from zero to one in a tensor sized + ``[batch_size, height, width, boxes_per_cell, 2]``. image_size: Width and height in a vector that will be used to scale the coordinates. Returns: - Global coordinates scaled to the size of the network input image, in a tensor with the - same shape as the input tensor. + Global coordinates scaled to the size of the network input image, in a tensor with the same shape as the input + tensor. """ height = xy.shape[1] width = xy.shape[2] grid_size = torch.tensor([width, height], device=xy.device) - offset = grid_offsets(grid_size).unsqueeze(2) # [height, width, 1, 2] + # Scripting requires explicit conversion to a floating point type. + offset = grid_offsets(grid_size).to(xy.dtype).unsqueeze(2) # [height, width, 1, 2] scale = torch.true_divide(image_size, grid_size) return (xy + offset) * scale def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: - """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at - the same coordinates. + """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at the + same coordinates. Args: dims1: Width and height of `N` boxes. Tensor of size ``[N, 2]``. dims2: Width and height of `M` boxes. Tensor of size ``[M, 2]``. 
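A small numeric illustration of the coordinate transform described for ``global_xy`` above (the values are made up): with a 2x2 grid over a 256x256 input, a predicted center of (0.5, 0.5) in the cell at offset (1, 0) ends up at ((0.5 + 1) * 128, (0.5 + 0) * 128) = (192, 64) in image coordinates.

    import torch

    xy = torch.tensor([0.5, 0.5])              # predicted center within the grid cell
    offset = torch.tensor([1.0, 0.0])          # the cell's (x, y) offset in the grid
    image_size = torch.tensor([256.0, 256.0])
    grid_size = torch.tensor([2.0, 2.0])
    scale = image_size / grid_size             # 128 pixels per grid cell
    print((xy + offset) * scale)               # tensor([192., 64.])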
Returns: - Tensor of size ``[N, M]`` containing the pairwise IoU values for every element in - ``dims1`` and ``dims2`` + Tensor of size ``[N, M]`` containing the pairwise IoU values for every element in ``dims1`` and ``dims2`` """ area1 = dims1[:, 0] * dims1[:, 1] # [N] area2 = dims2[:, 0] * dims2[:, 1] # [M] @@ -83,8 +88,8 @@ def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> List[Tensor]: - """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target - significantly (IoU greater than ``threshold``). + """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target significantly + (IoU greater than ``threshold``). Args: pred_boxes: The predicted corner coordinates. Tensor of size ``[height, width, boxes_per_cell, 4]``. @@ -110,7 +115,7 @@ def is_inside_box(points, boxes): boxes: box (x1, y1, x2, y2) coordinates, [boxes, 4] Returns: - A tensor shaped ``[boxes, points]`` containing pairwise truth values of whether the points are inside the boxes + A tensor shaped ``[boxes, points]`` containing pairwise truth values of whether the points are inside the boxes. """ points = points.unsqueeze(0) # [1, points, 2] boxes = boxes.unsqueeze(1) # [boxes, 1, 4] diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index 0c8cf8771c..d17b2fe242 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -13,8 +13,19 @@ class DetectionLayer(nn.Module): """A YOLO detection layer. - A YOLO model has usually 1 - 3 detection layers at different - resolutions. The loss should be summed from all of them. + A YOLO model has usually 1 - 3 detection layers at different resolutions. The loss is summed from all of them. + + Args: + num_classes: Number of different classes that this layer predicts. + anchor_dims: A list of the anchor box dimensions for this layer. The list should contain (width, height) tuples + in the network input resolution (relative to the width and height defined in the configuration file). + matching_func: The matching algorithm to be used for assigning targets to anchors. + loss_func: ``LossFunction`` object for calculating the losses. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the + detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and + height are scaled up so that the maximum value is four times the anchor dimension. """ def __init__( @@ -26,21 +37,6 @@ def __init__( xy_scale: float = 1.0, input_is_normalized: bool = False, ) -> None: - """ - Args: - num_classes: Number of different classes that this layer predicts. - anchor_dims: A list of the anchor box dimensions for this layer. The list should - contain (width, height) tuples in the network input resolution (relative to the - width and height defined in the configuration file). - matching_func: The matching algorithm to be used for assigning targets to anchors. - loss_func: ``LossFunction`` object for calculating the losses. - xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. - Using a value > 1.0 helps to produce coordinate values close to one. 
- input_is_normalized: The input is normalized by logistic activation in the previous - layer. In this case the detection layer will not take the sigmoid of the coordinate - and probability predictions, and the width and height are scaled up so that the - maximum value is four times the anchor dimension - """ super().__init__() if not _TORCHVISION_AVAILABLE: # pragma: no cover @@ -88,9 +84,9 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str x = x.permute(0, 2, 3, 1) # [batch_size, height, width, boxes_per_cell * num_attrs] x = x.view(batch_size, height, width, boxes_per_cell, num_attrs) - # Take the sigmoid of the bounding box coordinates, confidence score, and class - # probabilities, unless the input is normalized by the previous layer activation. Confidence - # and class losses use the unnormalized values if possible. + # Take the sigmoid of the bounding box coordinates, confidence score, and class probabilities, unless the input + # is normalized by the previous layer activation. Confidence and class losses use the unnormalized values if + # possible. norm_x = x if self.input_is_normalized else torch.sigmoid(x) xy = norm_x[..., :2] wh = x[..., 2:4] @@ -99,9 +95,8 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str norm_confidence = norm_x[..., 4] norm_classprob = norm_x[..., 5:] - # Eliminate grid sensitivity. The previous layer should output extremely high values for - # the sigmoid to produce x/y coordinates close to one. YOLOv4 solves this by scaling the - # x/y coordinates. + # Eliminate grid sensitivity. The previous layer should output extremely high values for the sigmoid to produce + # x/y coordinates close to one. YOLOv4 solves this by scaling the x/y coordinates. xy = xy * self.xy_scale - 0.5 * (self.xy_scale - 1) image_xy = global_xy(xy, image_size) @@ -183,15 +178,15 @@ def forward(self, x): class RouteLayer(nn.Module): - """Route layer concatenates the output (or part of it) from given layers.""" + """Route layer concatenates the output (or part of it) from given layers. + + Args: + source_layers: Indices of the layers whose output will be concatenated. + num_chunks: Layer outputs will be split into this number of chunks. + chunk_idx: Only the chunks with this index will be concatenated. + """ def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> None: - """ - Args: - source_layers: Indices of the layers whose output will be concatenated. - num_chunks: Layer outputs will be split into this number of chunks. - chunk_idx: Only the chunks with this index will be concatenated. - """ super().__init__() self.source_layers = source_layers self.num_chunks = num_chunks @@ -203,14 +198,13 @@ def forward(self, x, outputs): class ShortcutLayer(nn.Module): - """Shortcut layer adds a residual connection from the source layer.""" + """Shortcut layer adds a residual connection from the source layer. + + Args: + source_layer: Index of the layer whose output will be added to the output of the previous layer. + """ def __init__(self, source_layer: int) -> None: - """ - Args: - source_layer: Index of the layer whose output will be added to the output of the - previous layer. 
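To make the grid-sensitivity scaling above concrete (the numbers are illustrative): with ``xy_scale = 1.05``, a typical YOLOv4 configuration value, a sigmoid output of 1.0 becomes 1.0 * 1.05 - 0.5 * (1.05 - 1) = 1.025 and an output of 0.0 becomes -0.025, so a box center can sit exactly on a cell boundary without requiring extreme values from the preceding layer.

    import torch

    xy_scale = 1.05
    norm_xy = torch.tensor([0.0, 0.5, 1.0])  # sigmoid outputs
    print(norm_xy * xy_scale - 0.5 * (xy_scale - 1.0))  # tensor([-0.0250, 0.5000, 1.0250])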
- """ super().__init__() self.source_layer = source_layer diff --git a/pl_bolts/models/detection/yolo/yolo_loss.py b/pl_bolts/models/detection/yolo/yolo_loss.py index 5f3f38476a..99e9293556 100644 --- a/pl_bolts/models/detection/yolo/yolo_loss.py +++ b/pl_bolts/models/detection/yolo/yolo_loss.py @@ -16,8 +16,8 @@ def _upcast(t: Tensor) -> Tensor: def complete_iou(boxes1: Tensor, boxes2: Tensor, distance_only: bool = False) -> Tensor: - """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to - be in `(x1, y1, x2, y2)` format. + """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to be in + `(x1, y1, x2, y2)` format. Args: boxes1: Box coordinates in a tensor of size ``[N, 4]``. diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index fac07e7f4f..cde9e308d1 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -33,7 +33,8 @@ class YOLO(LightningModule): *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `_ - *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `_ + *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao + `_ *YOLOX paper*: `Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun `_ @@ -136,9 +137,26 @@ def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = N losses = [] # Losses from detection layers hits = [] # Number of targets each detection layer was responsible for - image_height = images.shape[2] - image_width = images.shape[3] - image_size = torch.tensor([image_width, image_height], device=images.device) + @torch.jit.script + def get_image_size(images: Tensor) -> Tensor: + """Get the image size from an input tensor. + + The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing + based generator will loose track of e.g. ``images.shape[1]`` and treat it as a Python variable and not a + tensor. This will cause the dimension to be treated as a constant in the model, which prevents dynamic + input sizes. + + Args: + images: An image batch to take the width and height from. + + Returns: + A tensor that contains the image width and height. 
+ """ + height = images.shape[2] + width = images.shape[3] + return torch.tensor([width, height], device=images.device) + + image_size = get_image_size(images) x = images for layer in self.network: From 8d70ca1bed85ea146a30ed052765cc6ffc5c9ab2 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 27 Jan 2022 08:40:48 +0200 Subject: [PATCH 03/76] meshgrid() call made future-proof by using the indexing argument --- .../detection/yolo/darknet_configuration.py | 8 ++++---- .../models/detection/yolo/target_matching.py | 3 ++- pl_bolts/models/detection/yolo/utils.py | 19 ++++++++++++++----- pl_bolts/models/detection/yolo/yolo_loss.py | 4 ++-- pl_bolts/models/detection/yolo/yolo_module.py | 8 ++++---- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_configuration.py b/pl_bolts/models/detection/yolo/darknet_configuration.py index e2f1474d2a..ac4626ac08 100644 --- a/pl_bolts/models/detection/yolo/darknet_configuration.py +++ b/pl_bolts/models/detection/yolo/darknet_configuration.py @@ -23,8 +23,8 @@ class DarknetConfiguration: """ def __init__(self, path: str) -> None: - """Saves the variables from the first configuration section to attributes of this object, and the rest of the - sections to the ``layer_configs`` list. + """Saves the variables from the first configuration section to attributes of this object, and the rest of + the sections to the ``layer_configs`` list. Args: path: Path to a configuration file @@ -40,8 +40,8 @@ def __init__(self, path: str) -> None: self.layer_configs = sections[1:] def get_network(self, **kwargs) -> nn.ModuleList: - """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns the - network structure that can be used to create a YOLO model. + """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns + the network structure that can be used to create a YOLO model. Returns: A :class:`~torch.nn.ModuleList` that defines the YOLO network. diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 2c7fe10daf..483729770f 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -163,7 +163,8 @@ def match(self, wh): class SizeRatioMatching(ShapeMatching): - """For each target, select those prior shapes, whose width and height relative to the target is below given ratio. + """For each target, select those prior shapes, whose width and height relative to the target is below given + ratio. This is the matching rule used by Ultralytics YOLOv5 implementation. diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index 6000371f02..6c5b6212aa 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -1,6 +1,7 @@ from typing import List import torch +from packaging import version from torch import Tensor from pl_bolts.utils import _TORCHVISION_AVAILABLE @@ -12,6 +13,14 @@ warn_missing_pkg("torchvision") +# PyTorch 1.10 introduced the argument "indexing" and deprecated calling without the argument. Since we call it inside +# a "@torch.jit.script" function, it's difficult to make this decision at call time. 
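A sketch of what the scripted size helpers above enable: exporting to ONNX with dynamic spatial dimensions only works when the sizes are computed from tensors rather than traced into constants. Here ``model`` is assumed to be an already constructed YOLO module and the file name is a placeholder; this is not a snippet from the repository.

    import torch

    # "model" is assumed to be a YOLO module built elsewhere; "yolo.onnx" is a placeholder.
    dummy = torch.rand(1, 3, 416, 416)
    torch.onnx.export(
        model,
        dummy,
        "yolo.onnx",
        input_names=["images"],
        output_names=["detections"],
        dynamic_axes={"images": {0: "batch", 2: "height", 3: "width"}},
    )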
+if version.parse(torch.__version__) >= version.parse("1.10.0"): + meshgrid = lambda *tensors: torch.meshgrid(*tensors, indexing="ij") +else: + meshgrid = torch.meshgrid + + def grid_offsets(grid_size: Tensor) -> Tensor: """Given a grid size, returns a tensor containing offsets to the grid cells. @@ -23,7 +32,7 @@ def grid_offsets(grid_size: Tensor) -> Tensor: """ x_range = torch.arange(grid_size[0], device=grid_size.device) y_range = torch.arange(grid_size[1], device=grid_size.device) - grid_y, grid_x = torch.meshgrid(y_range, x_range) + grid_y, grid_x = meshgrid(y_range, x_range) return torch.stack((grid_x, grid_y), -1) @@ -67,8 +76,8 @@ def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: - """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at the - same coordinates. + """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at + the same coordinates. Args: dims1: Width and height of `N` boxes. Tensor of size ``[N, 2]``. @@ -88,8 +97,8 @@ def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> List[Tensor]: - """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target significantly - (IoU greater than ``threshold``). + """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target + significantly (IoU greater than ``threshold``). Args: pred_boxes: The predicted corner coordinates. Tensor of size ``[height, width, boxes_per_cell, 4]``. diff --git a/pl_bolts/models/detection/yolo/yolo_loss.py b/pl_bolts/models/detection/yolo/yolo_loss.py index 99e9293556..5f3f38476a 100644 --- a/pl_bolts/models/detection/yolo/yolo_loss.py +++ b/pl_bolts/models/detection/yolo/yolo_loss.py @@ -16,8 +16,8 @@ def _upcast(t: Tensor) -> Tensor: def complete_iou(boxes1: Tensor, boxes2: Tensor, distance_only: bool = False) -> Tensor: - """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to be in - `(x1, y1, x2, y2)` format. + """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to + be in `(x1, y1, x2, y2)` format. Args: boxes1: Box coordinates in a tensor of size ``[N, 4]``. diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index cde9e308d1..eaf5a38a6c 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -26,8 +26,8 @@ class YOLO(LightningModule): - """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, - Scaled-YOLOv4, and YOLOX. + """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, + YOLOv5, Scaled-YOLOv4, and YOLOX. *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `_ @@ -517,8 +517,8 @@ def __init__( class ResizedVOCDetectionDataModule(VOCDetectionDataModule): - """A subclass of VOCDetectionDataModule that resizes the images to a specific size. YOLO expectes the image size to - be divisible by the ratio in which the network downsamples the image. + """A subclass of VOCDetectionDataModule that resizes the images to a specific size. YOLO expectes the image + size to be divisible by the ratio in which the network downsamples the image. 
Args: width: Resize images to this width. From 35a98ba13a79a9e252f2582ed16b8d32661aebdf Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 27 Jan 2022 09:43:22 +0200 Subject: [PATCH 04/76] torch.jit.script fails with a lambda function --- pl_bolts/models/detection/yolo/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index 6c5b6212aa..7cb5e615a3 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -16,7 +16,8 @@ # PyTorch 1.10 introduced the argument "indexing" and deprecated calling without the argument. Since we call it inside # a "@torch.jit.script" function, it's difficult to make this decision at call time. if version.parse(torch.__version__) >= version.parse("1.10.0"): - meshgrid = lambda *tensors: torch.meshgrid(*tensors, indexing="ij") + def meshgrid(x, y): + return torch.meshgrid((x, y), indexing="ij") else: meshgrid = torch.meshgrid From a91793c102a8c805612e1ad440f8d9cd822d91ef Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Mar 2022 16:01:27 +0200 Subject: [PATCH 05/76] YOLOV4Tiny, YOLOV5, and YOLOX network architectures in plain PyTorch --- pl_bolts/models/detection/__init__.py | 16 +- .../detection/yolo/darknet_configuration.py | 316 ------- .../models/detection/yolo/darknet_network.py | 384 +++++++++ .../models/detection/yolo/target_matching.py | 114 +-- .../models/detection/yolo/torch_networks.py | 801 ++++++++++++++++++ pl_bolts/models/detection/yolo/utils.py | 23 +- pl_bolts/models/detection/yolo/yolo_layers.py | 186 +++- pl_bolts/models/detection/yolo/yolo_loss.py | 15 +- pl_bolts/models/detection/yolo/yolo_module.py | 140 +-- tests/models/test_detection.py | 85 +- 10 files changed, 1555 insertions(+), 525 deletions(-) delete mode 100644 pl_bolts/models/detection/yolo/darknet_configuration.py create mode 100644 pl_bolts/models/detection/yolo/darknet_network.py create mode 100644 pl_bolts/models/detection/yolo/torch_networks.py diff --git a/pl_bolts/models/detection/__init__.py b/pl_bolts/models/detection/__init__.py index 8defadf410..1dca7e215e 100644 --- a/pl_bolts/models/detection/__init__.py +++ b/pl_bolts/models/detection/__init__.py @@ -1,13 +1,25 @@ from pl_bolts.models.detection import components from pl_bolts.models.detection.faster_rcnn import FasterRCNN from pl_bolts.models.detection.retinanet import RetinaNet -from pl_bolts.models.detection.yolo.darknet_configuration import DarknetConfiguration +from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork +from pl_bolts.models.detection.yolo.torch_networks import ( + CSPBackbone, + TinyBackbone, + YOLOV4TinyNetwork, + YOLOV5Network, + YOLOXNetwork, +) from pl_bolts.models.detection.yolo.yolo_module import YOLO __all__ = [ "components", "FasterRCNN", - "DarknetConfiguration", "YOLO", + "DarknetNetwork", + "YOLOV4TinyNetwork", + "YOLOV5Network", + "YOLOXNetwork", + "TinyBackbone", + "CSPBackbone", "RetinaNet", ] diff --git a/pl_bolts/models/detection/yolo/darknet_configuration.py b/pl_bolts/models/detection/yolo/darknet_configuration.py deleted file mode 100644 index ac4626ac08..0000000000 --- a/pl_bolts/models/detection/yolo/darknet_configuration.py +++ /dev/null @@ -1,316 +0,0 @@ -import re -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union -from warnings import warn - -import torch.nn as nn -from pytorch_lightning.utilities.exceptions import MisconfigurationException - -from 
pl_bolts.models.detection.yolo import yolo_layers -from pl_bolts.models.detection.yolo.target_matching import ( - HighestIoUMatching, - IoUThresholdMatching, - SimOTAMatching, - SizeRatioMatching, -) -from pl_bolts.models.detection.yolo.yolo_loss import LossFunction - - -class DarknetConfiguration: - """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. - - The :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method returns a PyTorch - module list that can be used to construct a YOLO model. - """ - - def __init__(self, path: str) -> None: - """Saves the variables from the first configuration section to attributes of this object, and the rest of - the sections to the ``layer_configs`` list. - - Args: - path: Path to a configuration file - """ - with open(path) as config_file: - sections = self._read_file(config_file) - - if len(sections) < 2: - raise MisconfigurationException("The model configuration file should include at least two sections.") - - self.__dict__.update(sections[0]) - self.global_config = sections[0] - self.layer_configs = sections[1:] - - def get_network(self, **kwargs) -> nn.ModuleList: - """Iterates through the layers from the configuration and creates corresponding PyTorch modules. Returns - the network structure that can be used to create a YOLO model. - - Returns: - A :class:`~torch.nn.ModuleList` that defines the YOLO network. - """ - result = nn.ModuleList() - num_inputs = [3] # Number of channels in the input of every layer up to the current layer - for layer_config in self.layer_configs: - config = {**self.global_config, **layer_config} - module, num_outputs = _create_layer(config, num_inputs, **kwargs) - result.append(module) - num_inputs.append(num_outputs) - return result - - def _read_file(self, config_file: Iterable[str]) -> List[Dict[str, Any]]: - """Reads a YOLOv4 network configuration file and returns a list of configuration sections. - - Args: - config_file: The configuration file to read. - - Returns: - A list of configuration sections. 
- """ - section_re = re.compile(r"\[([^]]+)\]") - list_variables = ("layers", "anchors", "mask", "scales") - variable_types = { - "activation": str, - "anchors": int, - "angle": float, - "batch": int, - "batch_normalize": bool, - "beta_nms": float, - "burn_in": int, - "channels": int, - "classes": int, - "cls_normalizer": float, - "decay": float, - "exposure": float, - "filters": int, - "from": int, - "groups": int, - "group_id": int, - "height": int, - "hue": float, - "ignore_thresh": float, - "iou_loss": str, - "iou_normalizer": float, - "iou_thresh": float, - "jitter": float, - "layers": int, - "learning_rate": float, - "mask": int, - "max_batches": int, - "max_delta": float, - "momentum": float, - "mosaic": bool, - "new_coords": int, - "nms_kind": str, - "num": int, - "obj_normalizer": float, - "pad": bool, - "policy": str, - "random": bool, - "resize": float, - "saturation": float, - "scales": float, - "scale_x_y": float, - "size": int, - "steps": str, - "stride": int, - "subdivisions": int, - "truth_thresh": float, - "width": int, - } - - section = None - sections = [] - - def convert(key, value): - """Converts a value to the correct type based on key.""" - if key not in variable_types: - warn("Unknown YOLO configuration variable: " + key) - return key, value - if key in list_variables: - value = [variable_types[key](v) for v in value.split(",")] - else: - value = variable_types[key](value) - return key, value - - for line in config_file: - line = line.strip() - if (not line) or (line[0] == "#"): - continue - - section_match = section_re.match(line) - if section_match: - if section is not None: - sections.append(section) - section = {"type": section_match.group(1)} - else: - key, value = line.split("=") - key = key.rstrip() - value = value.lstrip() - key, value = convert(key, value) - section[key] = value - if section is not None: - sections.append(section) - - return sections - - -def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tuple[nn.Module, int]: - """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the - layer config. - - Args: - config: Dictionary of configuration options for this layer. - num_inputs: Number of channels in the input of every layer up to this layer. - - Returns: - module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in - its output. 
- """ - create_func = { - "convolutional": _create_convolutional, - "maxpool": _create_maxpool, - "route": _create_route, - "shortcut": _create_shortcut, - "upsample": _create_upsample, - "yolo": _create_yolo, - } - return create_func[config["type"]](config, num_inputs, **kwargs) - - -def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwargs): - layer = nn.Sequential() - - batch_normalize = config.get("batch_normalize", False) - padding = (config["size"] - 1) // 2 if config["pad"] else 0 - - conv = nn.Conv2d( - num_inputs[-1], config["filters"], config["size"], config["stride"], padding, bias=not batch_normalize - ) - layer.add_module("conv", conv) - - if batch_normalize: - bn = nn.BatchNorm2d(config["filters"]) # YOLOv5: eps=0.001, momentum=0.03 - layer.add_module("bn", bn) - - activation_name = config["activation"] - if activation_name == "leaky": - leakyrelu = nn.LeakyReLU(0.1, inplace=True) - layer.add_module("leakyrelu", leakyrelu) - elif activation_name == "mish": - mish = yolo_layers.Mish() - layer.add_module("mish", mish) - elif activation_name == "swish": - swish = nn.SiLU(inplace=True) - layer.add_module("swish", swish) - elif activation_name == "logistic": - logistic = nn.Sigmoid() - layer.add_module("logistic", logistic) - elif activation_name == "linear": - pass - else: - raise MisconfigurationException("Unknown activation: " + activation_name) - - return layer, config["filters"] - - -def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): - """Creates a max pooling layer. - - Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. - """ - kernel_size = config["size"] - padding = (kernel_size - 1) // 2 - maxpool = nn.MaxPool2d(kernel_size, config["stride"], padding) - if kernel_size % 2 == 1: - return maxpool, num_inputs[-1] - - # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d - # on both sides. - layer = nn.Sequential() - layer.add_module("pad", nn.ZeroPad2d((0, 1, 0, 1))) - layer.add_module("maxpool", maxpool) - return layer, num_inputs[-1] - - -def _create_route(config, num_inputs: List[int], **kwargs): - num_chunks = config.get("groups", 1) - chunk_idx = config.get("group_id", 0) - - # 0 is the first layer, -1 is the previous layer - last = len(num_inputs) - 1 - source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] - - layer = yolo_layers.RouteLayer(source_layers, num_chunks, chunk_idx) - - # The number of outputs of a source layer is the number of inputs of the next layer. 
- num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers) - - return layer, num_outputs - - -def _create_shortcut(config: Dict[str, Any], num_inputs: List[int], **kwargs): - layer = yolo_layers.ShortcutLayer(config["from"]) - return layer, num_inputs[-1] - - -def _create_upsample(config: Dict[str, Any], num_inputs: List[int], **kwargs): - layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") - return layer, num_inputs[-1] - - -def _create_yolo( - config: Dict[str, Any], - num_inputs: List[int], - match_sim_ota: bool = False, - match_size_ratio: Optional[float] = None, - match_iou_threshold: Optional[float] = None, - ignore_iou_threshold: Optional[float] = None, - overlap_loss: Optional[Union[str, Callable]] = None, - predict_overlap: Optional[float] = None, - overlap_loss_multiplier: Optional[float] = None, - class_loss_multiplier: Optional[float] = None, - confidence_loss_multiplier: Optional[float] = None, - **kwargs, -): - # The "anchors" list alternates width and height. - anchor_dims = config["anchors"] - anchor_dims = [(anchor_dims[i], anchor_dims[i + 1]) for i in range(0, len(anchor_dims), 2)] - anchor_ids = config["mask"] - - xy_scale = config.get("scale_x_y", 1.0) - input_is_normalized = config.get("new_coords", 0) > 0 - ignore_iou_threshold = config.get("ignore_thresh", 1.0) if ignore_iou_threshold is None else ignore_iou_threshold - - overlap_loss = overlap_loss or config.get("iou_loss", "iou") - if overlap_loss_multiplier is None: - overlap_loss_multiplier = config.get("iou_normalizer", 1.0) - if class_loss_multiplier is None: - class_loss_multiplier = config.get("cls_normalizer", 1.0) - if confidence_loss_multiplier is None: - confidence_loss_multiplier = config.get("obj_normalizer", 1.0) - loss_func = LossFunction( - overlap_loss, predict_overlap, overlap_loss_multiplier, class_loss_multiplier, confidence_loss_multiplier - ) - - if sum(var is not None for var in (match_sim_ota, match_size_ratio, match_iou_threshold)) > 1: - raise ValueError("More than one matching algorithm specified.") - if match_sim_ota: - sim_ota_loss_func = LossFunction( - overlap_loss, None, overlap_loss_multiplier, class_loss_multiplier, confidence_loss_multiplier - ) - matching_func = SimOTAMatching(sim_ota_loss_func) - elif match_size_ratio is not None: - matching_func = SizeRatioMatching(match_size_ratio, anchor_dims, anchor_ids, ignore_iou_threshold) - elif match_iou_threshold is not None: - matching_func = IoUThresholdMatching(match_iou_threshold, anchor_dims, anchor_ids, ignore_iou_threshold) - else: - matching_func = HighestIoUMatching(anchor_dims, anchor_ids, ignore_iou_threshold) - - layer = yolo_layers.DetectionLayer( - num_classes=config["classes"], - anchor_dims=[anchor_dims[i] for i in anchor_ids], - matching_func=matching_func, - loss_func=loss_func, - xy_scale=xy_scale, - input_is_normalized=input_is_normalized, - ) - - return layer, num_inputs[-1] diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py new file mode 100644 index 0000000000..7b7da45766 --- /dev/null +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -0,0 +1,384 @@ +import io +import re +from collections import OrderedDict +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from warnings import warn + +import numpy as np +import torch +import torch.nn as nn +from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info +from pytorch_lightning.utilities.exceptions 
import MisconfigurationException +from torch import Tensor + +from pl_bolts.models.detection.yolo import yolo_layers +from pl_bolts.models.detection.yolo.utils import get_image_size + + +class DarknetNetwork(nn.Module): + """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation.""" + + def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwargs) -> None: + """Parses a Darknet configuration file and creates the network structure. + + Iterates through the layers from the configuration and creates corresponding PyTorch modules. If + ``weights_path`` is given and points to a Darknet model file, loads the convolutional layer weights from the + file. + + Args: + config_path: Path to a Darknet configuration file that defines the network architecture. + weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. + match_sim_ota: If ``True``, matches a target to an anchor using the SimOTA algorithm from YOLOX. + match_size_ratio: If specified, matches a target to an anchor if its width and height relative to the anchor + is smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects + for each target the anchor with the highest IoU. + match_iou_threshold: If specified, matches a target to an anchor if the IoU is higher than this threshold. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has + IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: Which function to use for calculating the overlap between boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + """ + super().__init__() + + with open(config_path) as config_file: + sections = self._read_config(config_file) + + if len(sections) < 2: + raise MisconfigurationException("The model configuration file should include at least two sections.") + + self.__dict__.update(sections[0]) + global_config = sections[0] + layer_configs = sections[1:] + + self.layers = nn.ModuleList() + # num_inputs will contain the number of channels in the input of every layer up to the current layer. It is + # initialized with the number of channels in the input image. 
+ num_inputs = [global_config.get("channels", 3)] + for layer_config in layer_configs: + config = {**global_config, **layer_config} + module, num_outputs = _create_layer(config, num_inputs, **kwargs) + self.layers.append(module) + num_inputs.append(num_outputs) + + if weights_path is not None: + with open(weights_path) as weight_file: + self.load_weights(weight_file) + + def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + outputs = [] # Outputs from all layers + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + for layer in self.layers: + if isinstance(layer, (yolo_layers.RouteLayer, yolo_layers.ShortcutLayer)): + x = layer(x, outputs) + elif isinstance(layer, yolo_layers.DetectionLayer): + x = layer(x, image_size, targets) + detections.append(x) + if targets is not None: + losses.append(layer.losses) + hits.append(layer.hits) + else: + x = layer(x) + + outputs.append(x) + + return detections, losses, hits + + def load_weights(self, weight_file: io.IOBase): + """Loads weights to layer modules from a pretrained Darknet model. + + One may want to continue training from pretrained weights, on a dataset with a different number of object + categories. The number of kernels in the convolutional layers just before each detection layer depends on the + number of output classes. The Darknet solution is to truncate the weight file and stop reading weights at the + first incompatible layer. For this reason the function silently leaves the rest of the layers unchanged, when + the weight file ends. + + Args: + weight_file: A file-like object containing model weights in the Darknet binary format. + """ + if not isinstance(weight_file, io.IOBase): + raise ValueError("weight_file must be a file-like object.") + + version = np.fromfile(weight_file, count=3, dtype=np.int32) + images_seen = np.fromfile(weight_file, count=1, dtype=np.int64) + rank_zero_info( + f"Loading weights from Darknet model version {version[0]}.{version[1]}.{version[2]} " + f"that has been trained on {images_seen[0]} images." + ) + + def read(tensor): + """Reads the contents of ``tensor`` from the current position of ``weight_file``. + + If there's no more data in ``weight_file``, returns without error. + """ + x = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) + if x.size > 0: + x = torch.from_numpy(x).view_as(tensor) + with torch.no_grad(): + tensor.copy_(x) + return x.size + + for layer_idx, layer in enumerate(self.layers): + # Weights are loaded only to convolutional layers + if not isinstance(layer, yolo_layers.Conv): + continue + + rank_zero_debug(f"Reading weights for layer {layer_idx}: {list(layer.conv.weight.shape)}") + + # If convolution is followed by batch normalization, read the batch normalization parameters. Otherwise we + # read the convolution bias. + if isinstance(layer.norm, nn.Identity): + read(layer.conv.bias) + else: + read(layer.norm.bias) + read(layer.norm.weight) + read(layer.norm.running_mean) + read(layer.norm.running_var) + + read_count = read(layer.conv.weight) + if read_count == 0: + return + + def _read_config(self, config_file: Iterable[str]) -> List[Dict[str, Any]]: + """Reads a YOLOv4 network configuration file and returns a list of configuration sections. + + Args: + config_file: The configuration file to read. + + Returns: + A list of configuration sections. 
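A usage sketch for ``DarknetNetwork``: the paths below are placeholders, and passing the network straight to the ``YOLO`` module mirrors the earlier tests, although the exact constructor signature after this refactoring is an assumption.

    import torch
    from pl_bolts.models.detection import YOLO, DarknetNetwork

    # Placeholder paths; any Darknet .cfg file and a compatible .weights file will do.
    network = DarknetNetwork("yolov4-tiny.cfg", weights_path="yolov4-tiny.weights")
    model = YOLO(network)
    detections = model(torch.rand(1, 3, 416, 416))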
+ """ + section_re = re.compile(r"\[([^]]+)\]") + list_variables = ("layers", "anchors", "mask", "scales") + variable_types = { + "activation": str, + "anchors": int, + "angle": float, + "batch": int, + "batch_normalize": bool, + "beta_nms": float, + "burn_in": int, + "channels": int, + "classes": int, + "cls_normalizer": float, + "decay": float, + "exposure": float, + "filters": int, + "from": int, + "groups": int, + "group_id": int, + "height": int, + "hue": float, + "ignore_thresh": float, + "iou_loss": str, + "iou_normalizer": float, + "iou_thresh": float, + "jitter": float, + "layers": int, + "learning_rate": float, + "mask": int, + "max_batches": int, + "max_delta": float, + "momentum": float, + "mosaic": bool, + "new_coords": int, + "nms_kind": str, + "num": int, + "obj_normalizer": float, + "pad": bool, + "policy": str, + "random": bool, + "resize": float, + "saturation": float, + "scales": float, + "scale_x_y": float, + "size": int, + "steps": str, + "stride": int, + "subdivisions": int, + "truth_thresh": float, + "width": int, + } + + section = None + sections = [] + + def convert(key, value): + """Converts a value to the correct type based on key.""" + if key not in variable_types: + warn("Unknown YOLO configuration variable: " + key) + return key, value + if key in list_variables: + value = [variable_types[key](v) for v in value.split(",")] + else: + value = variable_types[key](value) + return key, value + + for line in config_file: + line = line.strip() + if (not line) or (line[0] == "#"): + continue + + section_match = section_re.match(line) + if section_match: + if section is not None: + sections.append(section) + section = {"type": section_match.group(1)} + else: + key, value = line.split("=") + key = key.rstrip() + value = value.lstrip() + key, value = convert(key, value) + section[key] = value + if section is not None: + sections.append(section) + + return sections + + +def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tuple[nn.Module, int]: + """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the + layer config. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ + create_func = { + "convolutional": _create_convolutional, + "maxpool": _create_maxpool, + "route": _create_route, + "shortcut": _create_shortcut, + "upsample": _create_upsample, + "yolo": _create_yolo, + } + return create_func[config["type"]](config, num_inputs, **kwargs) + + +def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwargs): + batch_normalize = config.get("batch_normalize", False) + padding = (config["size"] - 1) // 2 if config["pad"] else 0 + + layer = yolo_layers.Conv( + num_inputs[-1], + config["filters"], + kernel_size=config["size"], + stride=config["stride"], + padding=padding, + bias=not batch_normalize, + activation=config["activation"], + norm="batchnorm" if batch_normalize else None, + ) + return layer, config["filters"] + + +def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): + """Creates a max pooling layer. + + Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. 
+ """ + kernel_size = config["size"] + padding = (kernel_size - 1) // 2 + maxpool = nn.MaxPool2d(kernel_size, config["stride"], padding) + if kernel_size % 2 == 1: + return maxpool, num_inputs[-1] + + # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d + # on both sides. + layer = nn.Sequential( + OrderedDict( + [ + ("pad", nn.ZeroPad2d((0, 1, 0, 1))), + ("maxpool", maxpool), + ] + ) + ) + return layer, num_inputs[-1] + + +def _create_route(config, num_inputs: List[int], **kwargs): + num_chunks = config.get("groups", 1) + chunk_idx = config.get("group_id", 0) + + # 0 is the first layer, -1 is the previous layer + last = len(num_inputs) - 1 + source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] + + layer = yolo_layers.RouteLayer(source_layers, num_chunks, chunk_idx) + + # The number of outputs of a source layer is the number of inputs of the next layer. + num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers) + + return layer, num_outputs + + +def _create_shortcut(config: Dict[str, Any], num_inputs: List[int], **kwargs): + layer = yolo_layers.ShortcutLayer(config["from"]) + return layer, num_inputs[-1] + + +def _create_upsample(config: Dict[str, Any], num_inputs: List[int], **kwargs): + layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") + return layer, num_inputs[-1] + + +def _create_yolo( + config: Dict[str, Any], + num_inputs: List[int], + prior_shapes: Optional[List[Tuple[int, int]]] = None, + matching_algorithm: Optional[str] = None, + matching_threshold: Optional[float] = None, + ignore_bg_threshold: Optional[float] = None, + overlap_func: Optional[Union[str, Callable]] = None, + predict_overlap: Optional[float] = None, + overlap_loss_multiplier: Optional[float] = None, + confidence_loss_multiplier: Optional[float] = None, + class_loss_multiplier: Optional[float] = None, + **kwargs, +): + if prior_shapes is None: + # The "anchors" list alternates width and height. 
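The padding arithmetic in ``_create_maxpool`` above can be checked with a standalone example (the sizes are made up): with an even kernel, one extra cell of zero padding on the right and bottom keeps the output resolution at ``ceil(input / stride)``, matching what an odd kernel achieves with symmetric padding alone.

    import torch
    import torch.nn as nn

    x = torch.rand(1, 1, 13, 13)
    # Odd kernel: symmetric padding of (size - 1) // 2 is sufficient.
    odd = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    print(odd(x).shape)   # torch.Size([1, 1, 13, 13])
    # Even kernel: add one extra cell on the right and bottom before pooling.
    even = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
    print(even(x).shape)  # torch.Size([1, 1, 7, 7]), i.e. ceil(13 / 2)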
+ prior_shapes = config["anchors"] + prior_shapes = [(prior_shapes[i], prior_shapes[i + 1]) for i in range(0, len(prior_shapes), 2)] + if ignore_bg_threshold is None: + ignore_bg_threshold = config.get("ignore_thresh", 1.0) + if overlap_func is None: + overlap_func = config.get("iou_loss", "iou") + if overlap_loss_multiplier is None: + overlap_loss_multiplier = config.get("iou_normalizer", 1.0) + if confidence_loss_multiplier is None: + confidence_loss_multiplier = config.get("obj_normalizer", 1.0) + if class_loss_multiplier is None: + class_loss_multiplier = config.get("cls_normalizer", 1.0) + + layer = yolo_layers.create_detection_layer( + num_classes=config["classes"], + prior_shapes=prior_shapes, + prior_shape_idxs=config["mask"], + matching_algorithm=matching_algorithm, + matching_threshold=matching_threshold, + ignore_bg_threshold=ignore_bg_threshold, + overlap_func=overlap_func, + predict_overlap=predict_overlap, + overlap_loss_multiplier=overlap_loss_multiplier, + confidence_loss_multiplier=confidence_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, + xy_scale=config.get("scale_x_y", 1.0), + input_is_normalized=config.get("new_coords", 0) > 0, + ) + return layer, None diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 483729770f..bcbd6fad7c 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -21,26 +21,18 @@ class ShapeMatching(ABC): Most YOLO variants match targets to anchors based on prior shapes that are assigned to the anchors in the model configuration. The subclasses of ``ShapeMatching`` implement matching rules that compare the width and height of - the targets to each prior shape, regardless of the grid cell where the target is. When the model includes multiple + the targets to each prior shape (regardless of the grid cell where the target is). When the model includes multiple detection layers, different shapes are defined for each layer. Usually there are three detection layers and three prior shapes per layer. Args: - anchor_dims: A list of all the prior shapes. The list should contain (width, height) tuples in the network input - resolution (relative to the width and height defined in the configuration file). - anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this - layer uses. - ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. """ - def __init__(self, anchor_dims: List[Tuple[int, int]], anchor_ids: List[int], ignore_iou_threshold: float = 0.7): - self.anchor_dims = anchor_dims - # anchor_map maps the anchor indices to predictors in this layer, or to -1 if it's not an anchor of this layer. - # This layer ignores the target if all the selected anchors are in another layer. 
- self.anchor_map = [anchor_ids.index(i) if i in anchor_ids else -1 for i in range(len(anchor_dims))] - self.ignore_iou_threshold = ignore_iou_threshold + def __init__(self, ignore_bg_threshold: float = 0.7): + self.ignore_bg_threshold = ignore_bg_threshold def __call__( self, @@ -82,7 +74,7 @@ def __call__( # Background mask is used to select predictors that are not responsible for predicting any object, for # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a # predicted box overlaps any target significantly, or if a prediction is matched to a target. - background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_iou_threshold) + background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) background_mask[cell_j, cell_i, matched_predictors] = False preds = { @@ -105,8 +97,8 @@ def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: wh: A matrix of predicted width and height values. Returns: - matched_targets, matched_predictors: A vector that can be used to select the targets that this layer - matched and a vector that lists the matching predictors within the grid cell. + matched_targets, matched_anchors: Two vectors or a `2xN` matrix. The first vector is used to select the + targets that this layer matched and the second one lists the matching anchors within the grid cell. """ pass @@ -115,13 +107,33 @@ class HighestIoUMatching(ShapeMatching): """For each target, select the prior shape that gives the highest IoU. This is the original YOLO matching rule. + + Args: + prior_shapes: A list of all the prior box dimensions. The list should contain (width, height) tuples in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. """ + def __init__( + self, prior_shapes: List[Tuple[int, int]], prior_shape_idxs: List[int], ignore_bg_threshold: float = 0.7 + ): + super().__init__(ignore_bg_threshold) + self.prior_shapes = prior_shapes + # anchor_map maps the anchor indices to predictors in this layer, or to -1 if it's not an anchor of this layer. + # This layer ignores the target if all the selected anchors are in another layer. + self.anchor_map = [ + prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 for idx in range(len(prior_shapes)) + ] + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: - anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) - ious = aligned_iou(wh, anchor_wh) + ious = aligned_iou(wh, prior_wh) highest_iou_anchors = ious.max(1).indices highest_iou_anchors = anchor_map[highest_iou_anchors] matched_targets = highest_iou_anchors >= 0 @@ -133,33 +145,33 @@ class IoUThresholdMatching(ShapeMatching): """For each target, select all prior shapes that give a high enough IoU. Args: + prior_shapes: A list of all the prior box dimensions. The list should contain (width, height) tuples in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. 
threshold: IoU treshold for matching. - anchor_dims: A list of all the prior shapes. The list should contain (width, height) tuples in the network input - resolution (relative to the width and height defined in the configuration file). - anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this - layer uses. - ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. """ - def __init__(self, threshold, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, + prior_shapes: List[Tuple[int, int]], + prior_shape_idxs: List[int], + threshold: float, + ignore_bg_threshold: float = 0.7, + ): + super().__init__(ignore_bg_threshold) + self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] self.threshold = threshold def match(self, wh): - anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) - anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) - ious = aligned_iou(wh, anchor_wh) + ious = aligned_iou(wh, prior_wh) above_threshold = (ious > self.threshold).nonzero() - targets_above_threshold = above_threshold[:, 0] - anchors_above_threshold = above_threshold[:, 1] - anchors_above_threshold = anchor_map[anchors_above_threshold] - local = anchors_above_threshold >= 0 - matched_targets = targets_above_threshold[local] - matched_anchors = anchors_above_threshold[local] - return matched_targets, matched_anchors + return above_threshold.T class SizeRatioMatching(ShapeMatching): @@ -169,35 +181,35 @@ class SizeRatioMatching(ShapeMatching): This is the matching rule used by Ultralytics YOLOv5 implementation. Args: + prior_shapes: A list of all the prior box dimensions. The list should contain (width, height) tuples in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. threshold: Size ratio threshold for matching. - anchor_dims: A list of all the prior shapes. The list should contain (width, height) tuples in the network input - resolution (relative to the width and height defined in the configuration file). - anchor_ids: List of indices to ``anchor_dims`` that is used to select the (usually 3) prior shapes that this - layer uses. - ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. 
""" - def __init__(self, threshold, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, + prior_shapes: List[Tuple[int, int]], + prior_shape_idxs: List[int], + threshold: float, + ignore_bg_threshold: float = 0.7, + ): + super().__init__(ignore_bg_threshold) + self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] self.threshold = threshold def match(self, wh): - anchor_wh = torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) - anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) - wh_ratio = wh[:, None, :] / anchor_wh[None, :, :] # [num_targets, num_anchors, 2] + wh_ratio = wh[:, None, :] / prior_wh[None, :, :] # [num_targets, num_anchors, 2] wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio) wh_ratio = wh_ratio.max(2).values # [num_targets, num_anchors] below_threshold = (wh_ratio < self.threshold).nonzero() - targets_below_threshold = below_threshold[:, 0] - anchors_below_threshold = below_threshold[:, 1] - anchors_below_threshold = anchor_map[anchors_below_threshold] - local = anchors_below_threshold >= 0 - matched_targets = targets_below_threshold[local] - matched_anchors = anchors_below_threshold[local] - return matched_targets, matched_anchors + return below_threshold.T def _sim_ota_match(costs, ious): diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py new file mode 100644 index 0000000000..4065ae7f25 --- /dev/null +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -0,0 +1,801 @@ +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +from pl_bolts.models.detection.yolo.utils import get_image_size +from pl_bolts.models.detection.yolo.yolo_layers import Conv, create_detection_layer + + +class Bottleneck(nn.Module): + """A bottleneck from YOLOv5. + + Args: + in_channels: Number of input channels that the bottleneck expects. + out_channels: Number of output channels that the bottleneck produces. + shortcut: Whether the bottleneck should include a shortcut connection. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels, + out_channels, + shortcut: bool = True, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + self.convs = nn.Sequential( + Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm), + Conv(out_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm), + ) + self.shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.convs(x) + return x + y if self.shortcut else y + + +class TinyStage(nn.Module): + """One stage of the "tiny" network architecture from YOLOv4. + + Args: + num_channels: Number of channels in the stage input and output. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + num_channels: int, + activation: Optional[str] = "leaky", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + + hidden_channels = num_channels // 2 + self.conv1 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + self.conv2 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + self.mix = Conv(num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x): + x = torch.chunk(x, 2, dim=1)[1] + y1 = self.conv1(x) + y2 = self.conv2(y1) + return self.mix(torch.cat((y2, y1), dim=1)) + + +class CSPStage(nn.Module): + """One stage of a Cross Stage Partial Network (CSPNet). + + Args: + in_channels: Number of input channels that the stage expects. + out_channels: Number of output channels that the stage produces. + depth: Number of bottlenecks that the stage contains. + shortcut: Whether the bottlenecks should include a shortcut connection. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + depth: int = 1, + shortcut: bool = True, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two + # convolutions with N/2 output channels. + self.split1 = Conv(in_channels, out_channels // 2, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split2 = Conv(in_channels, out_channels // 2, kernel_size=1, stride=1, activation=activation, norm=norm) + self.mix = Conv(out_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.bottlenecks = nn.Sequential( + *( + Bottleneck(out_channels // 2, out_channels // 2, shortcut, norm=norm, activation=activation) + for _ in range(depth) + ) + ) + + def forward(self, x): + y1 = self.bottlenecks(self.split1(x)) + y2 = self.split2(x) + return self.mix(torch.cat((y1, y2), dim=1)) + + +class FastSPP(nn.Module): + """Fast spatial pyramid pooling module. + + Args: + in_channels: Number of input channels that the module expects. + out_channels: Number of output channels that the module produces. + kernel_size: Kernel size for convolutional layers. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 5, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=kernel_size // 2) + self.mix = Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x): + y1 = self.conv(x) + y2 = self.maxpool(y1) + y3 = self.maxpool(y2) + y4 = self.maxpool(y3) + return self.mix(torch.cat((y1, y2, y3, y4), dim=1)) + + +class TinyBackbone(nn.Module): + """Backbone of the "tiny" network architecture from YOLOv4. + + Args: + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + width: int = 32, + activation: Optional[str] = "leaky", + normalization: Optional[str] = "batchnorm", + ): + super().__init__() + + def smooth(num_channels): + return Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def downsample(in_channels, out_channels): + conv = Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return nn.Sequential( + OrderedDict( + [ + ("conv", conv), + ("smooth", smooth(out_channels)), + ] + ) + ) + + def maxpool(out_channels): + return nn.Sequential( + OrderedDict( + [ + ("pad", nn.ZeroPad2d((0, 1, 0, 1))), + ("maxpool", nn.MaxPool2d(kernel_size=2, stride=2, padding=0)), + ("smooth", smooth(out_channels)), + ] + ) + ) + + self.stage1 = Conv(3, width, kernel_size=3, stride=2, activation=activation, norm=normalization) + self.downsample2 = downsample(width, width * 2) + self.stage2 = TinyStage(width * 2, activation=activation, norm=normalization) + self.downsample3 = maxpool(width * 4) + self.stage3 = TinyStage(width * 4, activation=activation, norm=normalization) + self.downsample4 = maxpool(width * 8) + self.stage4 = TinyStage(width * 8, activation=activation, norm=normalization) + self.downsample5 = maxpool(width * 16) + + def forward(self, x): + c1 = self.stage1(x) + x = self.downsample2(c1) + c2 = self.stage2(x) + x = torch.cat((x, c2), dim=1) + x = self.downsample3(x) + c3 = self.stage3(x) + x = torch.cat((x, c3), dim=1) + x = self.downsample4(x) + c4 = self.stage4(x) + x = torch.cat((x, c4), dim=1) + c5 = self.downsample5(x) + return c1, c2, c3, c4, c5 + + +class CSPBackbone(nn.Module): + """The Cross Stage Partial Network backbone from YOLOv5. + + Args: + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + depth: int = 3, + width: int = 64, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def csp(num_channels, depth): + return CSPStage(num_channels, num_channels, depth=depth) + + def spp(num_channels): + return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) + + self.stage1 = Conv(3, width, kernel_size=6, stride=2, padding=2, activation=activation, norm=normalization) + self.stage2 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width, width * 2)), + ("csp", csp(width * 2, depth)), + ] + ) + ) + self.stage3 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 2, width * 4)), + ("csp", csp(width * 4, depth * 2)), + ] + ) + ) + self.stage4 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 4, width * 8)), + ("csp", csp(width * 8, depth * 3)), + ] + ) + ) + self.stage5 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 8, width * 16)), + ("csp", csp(width * 16, depth)), + ("spp", spp(width * 16)), + ] + ) + ) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + c1 = self.stage1(x) + c2 = self.stage2(c1) + c3 = self.stage3(c2) + c4 = self.stage4(c3) + c5 = self.stage5(c4) + return c1, c2, c3, c4, c5 + + +class YOLOV4TinyNetwork(nn.Module): + """The "tiny" network architecture from YOLOv4. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. 
Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + width: int = 32, + activation: Optional[str] = "leaky", + normalization: Optional[str] = "batchnorm", + prior_shapes: List[Tuple[int, int]] = None, + **kwargs, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. + if prior_shapes is None: + prior_shapes = [ + (12, 16), + (19, 36), + (40, 28), + (36, 75), + (76, 55), + (72, 146), + (142, 110), + (192, 243), + (459, 401), + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def conv1x1(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def conv3x3(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def linear(in_channels, out_channels): + return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True) + + def detect(prior_shape_idxs): + return create_detection_layer( + prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs + ) + + self.backbone = backbone or TinyBackbone(width=width, normalization=normalization, activation=activation) + + self.lateral5 = conv1x1(width * 16, width * 8) + self.out5 = nn.Sequential( + conv3x3(width * 8, width * 16), + linear(width * 16, num_outputs), + ) + self.upsample5 = nn.Sequential( + conv1x1(width * 8, width * 4), + nn.Upsample(scale_factor=2, mode="nearest"), + ) + + self.lateral4 = conv3x3(width * 12, width * 8) + self.out4 = linear(width * 8, num_outputs) + self.upsample4 = nn.Sequential( + conv1x1(width * 8, width * 2), + nn.Upsample(scale_factor=2, mode="nearest"), + ) + + self.lateral3 = conv3x3(width * 6, width * 4) + self.out3 = linear(width * 4, num_outputs) + + self.detect3 = detect([0, 1, 2]) + self.detect4 = detect([3, 4, 5]) + self.detect5 = detect([6, 7, 8]) + + def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5 = self.backbone(x)[-3:] + + p5 = self.lateral5(c5) + x = torch.cat((self.upsample5(p5), c4), dim=1) + p4 = self.lateral4(x) + x = torch.cat((self.upsample4(p4), c3), dim=1) + p3 = self.lateral3(x) + + y = self.detect5(self.out5(p5), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect5.losses) + 
hits.append(self.detect5.hits) + + y = self.detect4(self.out4(p4), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect4.losses) + hits.append(self.detect4.hits) + + y = self.detect3(self.out3(p3), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect3.losses) + hits.append(self.detect3.hits) + + return detections, losses, hits + + +class YOLOV5Network(nn.Module): + """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting the ``depth`` + and ``width`` parameters. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. 
+ """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + depth: int = 3, + width: int = 64, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: List[Tuple[int, int]] = None, + **kwargs, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. + if prior_shapes is None: + prior_shapes = [ + (12, 16), + (19, 36), + (40, 28), + (36, 75), + (76, 55), + (72, 146), + (142, 110), + (192, 243), + (459, 401), + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def conv1x1(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def linear(in_channels, out_channels): + return nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def bottleneck(in_channels, out_channels): + return CSPStage( + in_channels, + out_channels, + depth=depth, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def detect(prior_shape_idxs): + return create_detection_layer( + prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs + ) + + self.backbone = backbone or CSPBackbone( + depth=depth, width=width, normalization=normalization, activation=activation + ) + + self.lateral3 = bottleneck(width * 8, width * 4) + self.out3 = linear(width * 4, num_outputs) + + self.lateral4a = nn.Sequential( + bottleneck(width * 16, width * 8), + conv1x1(width * 8, width * 4), + ) + self.lateral4b = bottleneck(width * 8, width * 8) + self.out4 = linear(width * 8, num_outputs) + + self.lateral5a = conv1x1(width * 16, width * 8) + self.lateral5b = bottleneck(width * 16, width * 16) + self.out5 = linear(width * 16, num_outputs) + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + self.downsample3 = downsample(width * 4, width * 4) + self.downsample4 = downsample(width * 8, width * 8) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5 = self.backbone(x)[-3:] + + p5 = self.lateral5a(c5) + x = torch.cat((self.upsample(p5), c4), dim=1) + p4 = self.lateral4a(x) + x = torch.cat((self.upsample(p4), c3), dim=1) + + n3 = self.lateral3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.lateral4b(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.lateral5b(x) + + y = self.detect3(self.out3(n3), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect3.losses) + hits.append(self.detect3.hits) + + y = self.detect4(self.out4(n4), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect4.losses) + hits.append(self.detect4.hits) + + y = 
self.detect5(self.out5(n5), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect5.losses) + hits.append(self.detect5.hits) + + return detections, losses, hits + + +class YOLOXNetwork(nn.Module): + """The YOLOX network architecture. Different variants (nano/tiny/s/m/l/x) can be achieved by adjusting the + ``depth`` and ``width`` parameters. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + depth: int = 3, + width: int = 64, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: List[Tuple[int, int]] = None, + **kwargs, + ) -> None: + super().__init__() + + # By default use one anchor per cell and the stride as the prior size. 
+ if prior_shapes is None: + prior_shapes = [(8, 8), (16, 16), (32, 32)] + anchors_per_cell = 1 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def conv1x1(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def conv3x3(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def linear(in_channels, out_channels): + return nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def bottleneck(in_channels, out_channels): + return CSPStage( + in_channels, + out_channels, + depth=depth, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def detect(prior_shape_idxs): + return create_detection_layer( + prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs + ) + + self.backbone = backbone or CSPBackbone( + depth=depth, width=width, normalization=normalization, activation=activation + ) + + self.lateral3 = bottleneck(width * 8, width * 4) + self.out3_stem = conv1x1(width * 4, width * 4) + self.out3_feat = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + ) + self.out3_box = linear(width * 4, anchors_per_cell * 4) + self.out3_confidence = linear(width * 4, anchors_per_cell) + self.out3_classprob = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + linear(width * 4, anchors_per_cell * num_classes), + ) + + self.lateral4a = nn.Sequential( + bottleneck(width * 16, width * 8), + conv1x1(width * 8, width * 4), + ) + self.lateral4b = bottleneck(width * 8, width * 8) + self.out4_stem = conv1x1(width * 8, width * 4) + self.out4_feat = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + ) + self.out4_box = linear(width * 4, anchors_per_cell * 4) + self.out4_confidence = linear(width * 4, anchors_per_cell) + self.out4_classprob = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + linear(width * 4, anchors_per_cell * num_classes), + ) + + self.lateral5a = conv1x1(width * 16, width * 8) + self.lateral5b = bottleneck(width * 16, width * 16) + self.out5_stem = conv1x1(width * 16, width * 4) + self.out5_feat = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + ) + self.out5_box = linear(width * 4, anchors_per_cell * 4) + self.out5_confidence = linear(width * 4, anchors_per_cell) + self.out5_classprob = nn.Sequential( + conv3x3(width * 4, width * 4), + conv3x3(width * 4, width * 4), + linear(width * 4, anchors_per_cell * num_classes), + ) + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + self.downsample3 = downsample(width * 4, width * 4) + self.downsample4 = downsample(width * 8, width * 8) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each 
detection layer was responsible for
+
+ image_size = get_image_size(x)
+
+ c3, c4, c5 = self.backbone(x)[-3:]
+
+ p5 = self.lateral5a(c5)
+ x = torch.cat((self.upsample(p5), c4), dim=1)
+ p4 = self.lateral4a(x)
+ x = torch.cat((self.upsample(p4), c3), dim=1)
+
+ n3 = self.lateral3(x)
+ x = torch.cat((self.downsample3(n3), p4), dim=1)
+ n4 = self.lateral4b(x)
+ x = torch.cat((self.downsample4(n4), p5), dim=1)
+ n5 = self.lateral5b(x)
+
+ x = self.out3_stem(n3)
+ features = self.out3_feat(x)
+ box = self.out3_box(features)
+ confidence = self.out3_confidence(features)
+ classprob = self.out3_classprob(x)
+ y = self.detect3(torch.cat((box, confidence, classprob), dim=1), image_size, targets)
+ detections.append(y)
+ if targets is not None:
+ losses.append(self.detect3.losses)
+ hits.append(self.detect3.hits)
+
+ x = self.out4_stem(n4)
+ features = self.out4_feat(x)
+ box = self.out4_box(features)
+ confidence = self.out4_confidence(features)
+ classprob = self.out4_classprob(x)
+ y = self.detect4(torch.cat((box, confidence, classprob), dim=1), image_size, targets)
+ detections.append(y)
+ if targets is not None:
+ losses.append(self.detect4.losses)
+ hits.append(self.detect4.hits)
+
+ x = self.out5_stem(n5)
+ features = self.out5_feat(x)
+ box = self.out5_box(features)
+ confidence = self.out5_confidence(features)
+ classprob = self.out5_classprob(x)
+ y = self.detect5(torch.cat((box, confidence, classprob), dim=1), image_size, targets)
+ detections.append(y)
+ if targets is not None:
+ losses.append(self.detect5.losses)
+ hits.append(self.detect5.hits)
+
+ return detections, losses, hits
diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py
index 7cb5e615a3..2ceb1313bb 100644
--- a/pl_bolts/models/detection/yolo/utils.py
+++ b/pl_bolts/models/detection/yolo/utils.py
@@ -12,12 +12,14 @@
 else:
 warn_missing_pkg("torchvision")

- # PyTorch 1.10 introduced the argument "indexing" and deprecated calling without the argument. Since we call it inside
 # a "@torch.jit.script" function, it's difficult to make this decision at call time.
 if version.parse(torch.__version__) >= version.parse("1.10.0"):
+
 def meshgrid(x, y):
 return torch.meshgrid((x, y), indexing="ij")
+
+
 else:
 meshgrid = torch.meshgrid
@@ -133,3 +135,22 @@ def is_inside_box(points, boxes):
 rb = boxes[..., 2:] - points # [boxes, points, 2]
 deltas = torch.cat((lt, rb), -1) # [boxes, points, 4]
 return deltas.min(-1).values > 0.0 # [boxes, points]
+
+
+@torch.jit.script
+def get_image_size(images: Tensor) -> Tensor:
+ """Get the image size from an input tensor.
+
+ The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing-based
+ generator will lose track of e.g. ``images.shape[1]`` and treat it as a Python variable and not a tensor. This will
+ cause the dimension to be treated as a constant in the model, which prevents dynamic input sizes.
+
+ Args:
+ images: An image batch to take the width and height from.
+
+ Returns:
+ A tensor that contains the image width and height.
+ """ + height = images.shape[2] + width = images.shape[3] + return torch.tensor([width, height], device=images.device) diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index d17b2fe242..fab39be6ce 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -1,10 +1,16 @@ -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor, nn from torchvision.ops import box_convert +from pl_bolts.models.detection.yolo.target_matching import ( + HighestIoUMatching, + IoUThresholdMatching, + SimOTAMatching, + SizeRatioMatching, +) from pl_bolts.models.detection.yolo.utils import global_xy from pl_bolts.models.detection.yolo.yolo_loss import LossFunction from pl_bolts.utils import _TORCHVISION_AVAILABLE @@ -17,8 +23,8 @@ class DetectionLayer(nn.Module): Args: num_classes: Number of different classes that this layer predicts. - anchor_dims: A list of the anchor box dimensions for this layer. The list should contain (width, height) tuples - in the network input resolution (relative to the width and height defined in the configuration file). + prior_shapes: A list of prior box dimensions for this layer, used for scaling the predicted dimensions. The list + should contain (width, height) tuples in the network input resolution. matching_func: The matching algorithm to be used for assigning targets to anchors. loss_func: ``LossFunction`` object for calculating the losses. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps @@ -31,7 +37,7 @@ class DetectionLayer(nn.Module): def __init__( self, num_classes: int, - anchor_dims: List[Tuple[int, int]], + prior_shapes: List[Tuple[int, int]], matching_func: Callable, loss_func: LossFunction, xy_scale: float = 1.0, @@ -43,7 +49,7 @@ def __init__( raise ModuleNotFoundError("YOLO model uses `torchvision`, which is not installed yet.") self.num_classes = num_classes - self.anchor_dims = anchor_dims + self.prior_shapes = prior_shapes self.matching_func = matching_func self.loss_func = loss_func self.xy_scale = xy_scale @@ -63,7 +69,7 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str Args: x: The output from the previous layer. Tensor of size - ``[batch_size, boxes_per_cell * (num_classes + 5), height, width]``. + ``[batch_size, anchors_per_cell * (num_classes + 5), height, width]``. image_size: Image width and height in a vector (defines the scale of the predicted and target coordinates). targets: If set, computes losses from detection layers against these targets. A list of target dictionaries, one for each image. 
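Editor's note: a quick illustration of the scripted ``get_image_size`` helper added to ``utils.py`` above. This is a minimal sketch, not part of the patch; it only assumes the module is importable as laid out in this diff. Because the helper is scripted, the width and height stay tensors that follow the input size when the model is exported via tracing, instead of being baked in as constants.

import torch
from pl_bolts.models.detection.yolo.utils import get_image_size

images = torch.rand(2, 3, 320, 480)  # a batch of two images: height 320, width 480
print(get_image_size(images))        # tensor([480, 320]), i.e. [width, height]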
@@ -73,16 +79,16 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str """ batch_size, num_features, height, width = x.shape num_attrs = self.num_classes + 5 - boxes_per_cell = num_features // num_attrs - if boxes_per_cell != len(self.anchor_dims): + anchors_per_cell = num_features // num_attrs + if anchors_per_cell != len(self.prior_shapes): raise MisconfigurationException( - "The model predicts {} bounding boxes per cell, but {} anchor boxes are defined " - "for this layer.".format(boxes_per_cell, len(self.anchor_dims)) + "The model predicts {} bounding boxes per cell, but {} anchor box dimensions are defined for this " + "layer.".format(anchors_per_cell, len(self.prior_shapes)) ) # Reshape the output to have the bounding box attributes of each grid cell on its own row. - x = x.permute(0, 2, 3, 1) # [batch_size, height, width, boxes_per_cell * num_attrs] - x = x.view(batch_size, height, width, boxes_per_cell, num_attrs) + x = x.permute(0, 2, 3, 1) # [batch_size, height, width, anchors_per_cell * num_attrs] + x = x.view(batch_size, height, width, anchors_per_cell, num_attrs) # Take the sigmoid of the bounding box coordinates, confidence score, and class probabilities, unless the input # is normalized by the previous layer activation. Confidence and class losses use the unnormalized values if @@ -101,13 +107,13 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str image_xy = global_xy(xy, image_size) if self.input_is_normalized: - image_wh = 4 * torch.square(wh) * torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + image_wh = 4 * torch.square(wh) * torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) else: - image_wh = torch.exp(wh) * torch.tensor(self.anchor_dims, dtype=wh.dtype, device=wh.device) + image_wh = torch.exp(wh) * torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) box = torch.cat((image_xy, image_wh), -1) box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy") output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) - output = output.reshape(batch_size, height * width * boxes_per_cell, num_attrs) + output = output.reshape(batch_size, height * width * anchors_per_cell, num_attrs) if targets is not None: # We want to use binary_cross_entropy_with_logits, so we'll use the unnormalized confidence and classprob, @@ -170,11 +176,45 @@ def _calculate_losses( self.hits = len(matched_targets["boxes"]) -class Mish(nn.Module): - """Mish activation.""" +class Conv(nn.Module): + """A convolutional layer with optional layer normalization and activation. + + Args: + in_channels: Number of input channels that the layer expects. + out_channels: Number of output channels that the convolution produces. + kernel_size: Size of the convolving kernel. + stride: Stride of the convolution. + padding: Padding added to all four sides of the input. + bias: If ``True``, adds a learnable bias to the output. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int = 1,
+ stride: int = 1,
+ padding: Optional[int] = None,
+ bias: bool = False,
+ activation: Optional[str] = "silu",
+ norm: Optional[str] = "batchnorm",
+ ):
+ super().__init__()
+
+ if padding is None:
+ padding = kernel_size // 2
+
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias)
+ self.norm = create_normalization_module(norm, out_channels)
+ self.act = create_activation_module(activation)

 def forward(self, x):
- return x * torch.tanh(nn.functional.softplus(x))
+ x = self.conv(x)
+ x = self.norm(x)
+ return self.act(x)


 class RouteLayer(nn.Module):
@@ -210,3 +250,113 @@ def __init__(self, source_layer: int) -> None:

 def forward(self, x, outputs):
 return outputs[-1] + outputs[self.source_layer]
+
+
+class Mish(nn.Module):
+ """Mish activation."""
+
+ def forward(self, x):
+ return x * torch.tanh(nn.functional.softplus(x))
+
+
+def create_activation_module(name: Optional[str]) -> nn.Module:
+ """Creates a layer activation module given its type as a string.
+
+ Args:
+ name: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear",
+ or "none".
+ """
+ if name == "relu":
+ return nn.ReLU(inplace=True)
+ if name == "leaky":
+ return nn.LeakyReLU(0.1, inplace=True)
+ if name == "mish":
+ return Mish()
+ if name == "silu" or name == "swish":
+ return nn.SiLU(inplace=True)
+ if name == "logistic":
+ return nn.Sigmoid()
+ if name == "linear" or name == "none" or name is None:
+ return nn.Identity()
+ raise ValueError(f"Activation type `{name}´ is unknown.")
+
+
+def create_normalization_module(name: Optional[str], num_channels: int) -> nn.Module:
+ """Creates a layer normalization module given its type as a string.
+
+ Group normalization always uses 8 channels. The most common network widths are divisible by this number.
+
+ Args:
+ name: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none".
+ num_channels: The number of input channels that the module expects.
+ """
+ if name == "batchnorm":
+ return nn.BatchNorm2d(num_channels, eps=0.001)
+ if name == "groupnorm":
+ return nn.GroupNorm(8, num_channels, eps=0.001)
+ if name == "none" or name is None:
+ return nn.Identity()
+ raise ValueError(f"Normalization layer type `{name}´ is unknown.")
+
+
+def create_detection_layer(
+ prior_shapes: List[Tuple[int, int]],
+ prior_shape_idxs: List[int],
+ matching_algorithm: Optional[str] = None,
+ matching_threshold: Optional[float] = None,
+ ignore_bg_threshold: float = 0.7,
+ overlap_func: Union[str, Callable] = "ciou",
+ predict_overlap: float = 1.0,
+ overlap_loss_multiplier: float = 5.0,
+ confidence_loss_multiplier: float = 1.0,
+ class_loss_multiplier: float = 1.0,
+ **kwargs,
+) -> DetectionLayer:
+ """Creates a detection layer module and the required loss function and target matching objects.
+
+ Args:
+ prior_shapes: A list of all the prior box dimensions, used for scaling the predicted dimensions and possibly for
+ matching the targets to the anchors. The list should contain (width, height) tuples in the network input
+ resolution.
+ prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that
+ this layer uses.
+ matching_algorithm: Which algorithm to use for matching targets to anchors.
"simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + if matching_algorithm == "simota": + loss_func = LossFunction( + overlap_func, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + ) + matching_func = SimOTAMatching(loss_func) + elif matching_algorithm == "size": + matching_func = SizeRatioMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + elif matching_algorithm == "iou": + matching_func = IoUThresholdMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + elif matching_algorithm == "maxiou" or matching_algorithm is None: + matching_func = HighestIoUMatching(prior_shapes, prior_shape_idxs, ignore_bg_threshold) + else: + raise ValueError(f"Matching algorithm `{matching_algorithm}´ is unknown.") + + loss_func = LossFunction( + overlap_func, predict_overlap, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + ) + + layer_shapes = [prior_shapes[i] for i in prior_shape_idxs] + return DetectionLayer(prior_shapes=layer_shapes, matching_func=matching_func, loss_func=loss_func, **kwargs) diff --git a/pl_bolts/models/detection/yolo/yolo_loss.py b/pl_bolts/models/detection/yolo/yolo_loss.py index 5f3f38476a..db261723fd 100644 --- a/pl_bolts/models/detection/yolo/yolo_loss.py +++ b/pl_bolts/models/detection/yolo/yolo_loss.py @@ -79,19 +79,20 @@ class LossFunction: """A class for calculating the YOLO losses from predictions and targets. Args: - overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. The function has to - return a tensor with as many elements as there are input boxes. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 
0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. - overlap_loss_multiplier: Multiply the overlap loss by this factor. - confidence_loss_multiplier: Multiply the confidence loss by this factor. - class_loss_multiplier: Multiply the classification loss by this factor. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. """ def __init__( self, - overlap_func: Union[str, Callable] = "iou", + overlap_func: Union[str, Callable] = "ciou", predict_overlap: Optional[float] = None, overlap_multiplier: float = 1.0, confidence_multiplier: float = 1.0, @@ -108,7 +109,7 @@ def __init__( elif callable(overlap_func): self.overlap_func = overlap_func else: - raise ValueError("Unknown overlap function: " + overlap_func) + raise ValueError(f"Overlap function type `{overlap_func}´ is unknown.") self.predict_overlap = predict_overlap diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index eaf5a38a6c..fc7a242e81 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -1,19 +1,15 @@ -import io from copy import copy from typing import Any, Callable, Dict, List, Optional, Tuple, Type -import numpy as np import torch import torch.nn as nn from pytorch_lightning import LightningModule from pytorch_lightning.utilities.cli import LightningCLI -from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info from torch import Tensor, optim from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose -from pl_bolts.models.detection.yolo.darknet_configuration import DarknetConfiguration -from pl_bolts.models.detection.yolo.yolo_layers import DetectionLayer, RouteLayer, ShortcutLayer +from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg @@ -40,9 +36,9 @@ class YOLO(LightningModule): *Implementation*: `Seppo Enarvi `_ - The network architecture can be read from a Darknet configuration file using the - :class:`~pl_bolts.models.detection.yolo.darknet_configuration.DarknetConfiguration` class, or created by some other - means, and provided as a list of PyTorch modules. + The network architecture can be written in PyTorch, or read from a Darknet configuration file using the + :class:`~pl_bolts.models.detection.yolo.darknet_network.DarknetNetwork` class. ``DarknetNetwork`` is also able to + read weights from a Darknet model file. The input from the data loader is expected to be a list of images. Each image is a tensor with shape ``[channels, height, width]``. The images from a single batch will be stacked into a single tensor, so the sizes @@ -70,11 +66,10 @@ class YOLO(LightningModule): - scores (``FloatTensor[N]``): detection confidences - labels (``Int64Tensor[N]``): the predicted labels for each object - Weights can be loaded from a Darknet model file using ``load_darknet_weights()``. - Args: - network: A list of network modules. This can be obtained from a Darknet configuration using the - :func:`~pl_bolts.models.detection.yolo.yolo_config.YOLOConfiguration.get_network` method. 
+ network: A module that represents the network layers. This can be obtained from a Darknet configuration using + :func:`~pl_bolts.models.detection.yolo.darknet_network.DarknetNetwork`, or it can be defined as PyTorch + code. optimizer: Which optimizer class to use for training. optimizer_params: Parameters to pass to the optimizer constructor. Weight decay will be applied only to convolutional layer weights. @@ -132,49 +127,7 @@ def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = N detection layers times the number of boxes predicted by one cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. """ - outputs = [] # Outputs from all layers - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for - - @torch.jit.script - def get_image_size(images: Tensor) -> Tensor: - """Get the image size from an input tensor. - - The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing - based generator will loose track of e.g. ``images.shape[1]`` and treat it as a Python variable and not a - tensor. This will cause the dimension to be treated as a constant in the model, which prevents dynamic - input sizes. - - Args: - images: An image batch to take the width and height from. - - Returns: - A tensor that contains the image width and height. - """ - height = images.shape[2] - width = images.shape[3] - return torch.tensor([width, height], device=images.device) - - image_size = get_image_size(images) - - x = images - for layer in self.network: - if isinstance(layer, (RouteLayer, ShortcutLayer)): - x = layer(x, outputs) - elif isinstance(layer, DetectionLayer): - if targets is None: - x = layer(x, image_size) - detections.append(x) - else: - x = layer(x, image_size, targets) - detections.append(x) - losses.append(layer.losses) - hits.append(layer.hits) - else: - x = layer(x) - - outputs.append(x) + detections, losses, hits = self.network(images, targets) detections = torch.cat(detections, 1) if targets is None: @@ -293,63 +246,6 @@ def infer(self, image: Tensor) -> Dict[str, Tensor]: detections = self.process_detections(detections) return detections[0] - def load_darknet_weights(self, weight_file): - """Loads weights to layer modules from a pretrained Darknet model. - - One may want to continue training from pretrained weights, on a dataset with a different number of object - categories. The number of kernels in the convolutional layers just before each detection layer depends on the - number of output classes. The Darknet solution is to truncate the weight file and stop reading weights at the - first incompatible layer. For this reason the function silently leaves the rest of the layers unchanged, when - the weight file ends. - - Args: - weight_file: A file object containing model weights in the Darknet binary format. - """ - if not isinstance(weight_file, io.IOBase): - raise ValueError("weight_file must be a file-like object.") - - version = np.fromfile(weight_file, count=3, dtype=np.int32) - images_seen = np.fromfile(weight_file, count=1, dtype=np.int64) - rank_zero_info( - f"Loading weights from Darknet model version {version[0]}.{version[1]}.{version[2]} " - f"that has been trained on {images_seen[0]} images." - ) - - def read(tensor): - """Reads the contents of ``tensor`` from the current position of ``weight_file``. 
- - If there's no more data in ``weight_file``, returns without error. - """ - x = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) - if x.size > 0: - x = torch.from_numpy(x).view_as(tensor) - with torch.no_grad(): - tensor.copy_(x) - return x.size - - for layer_idx, layer in enumerate(self.network): - # Weights are loaded only to convolutional layers - if not (isinstance(layer, nn.Sequential) and isinstance(layer[0], nn.Conv2d)): - continue - - conv = layer[0] - rank_zero_debug(f"Reading weights for layer {layer_idx}: {list(conv.weight.shape)}") - - # Convolution may be followed by batch normalization, in which case we read the batch - # normalization parameters and not the convolution bias. - if len(layer) > 1 and isinstance(layer[1], nn.BatchNorm2d): - bn = layer[1] - read(bn.bias) - read(bn.weight) - read(bn.running_mean) - read(bn.running_var) - else: - read(conv.bias) - - read_count = read(conv.weight) - if read_count == 0: - return - def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of @@ -459,7 +355,7 @@ class DarknetYOLO(YOLO): """A subclass of YOLO that uses a Darknet configuration file and can be configured using LightningCLI. At most one matching algorithm, ``match_sim_ota``, ``match_size_ratio``, or ``match_iou_threshold`` can be - specified. If none of them is given, the default algorithm is used, which matche a target to the prior shape + specified. If none of them is given, the default algorithm is used, which matches a target to the prior shape (anchor) that gives the highest IoU. CLI command:: @@ -476,10 +372,11 @@ class DarknetYOLO(YOLO): smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects for each target the anchor with the highest IoU. match_iou_threshold: If specified, matches a target to an anchor if the IoU is higher than this threshold. - ignore_iou_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. - overlap_loss: A function that will return the overlap loss given predicted and target boxes. + overlap_func: Which function to use for calculating the overlap between boxes. Valid values are "iou", "giou", + "diou", and "ciou". predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. 
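Editor's note: a hedged sketch of how the ``predict_overlap`` balance described above can be read. The docstring suggests a linear blend between a constant target of 1.0 and the measured overlap; the helper name and the interpolation below are assumptions made for illustration, not the actual implementation in ``loss.py``.

def blended_confidence_target(overlap: float, predict_overlap: float) -> float:
    # predict_overlap = 0.0 -> the target is 1.0 whenever an object is present
    # predict_overlap = 1.0 -> the target equals the overlap computed by ``overlap_func``
    # values in between blend the two linearly (assumed interpretation)
    return (1.0 - predict_overlap) * 1.0 + predict_overlap * overlap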
@@ -491,23 +388,26 @@ class DarknetYOLO(YOLO): def __init__( self, network_config: str, + darknet_weights: Optional[str] = None, match_sim_ota: bool = False, match_size_ratio: Optional[float] = None, match_iou_threshold: Optional[float] = None, - ignore_iou_threshold: Optional[float] = None, - overlap_loss: Optional[str] = None, + ignore_bg_threshold: Optional[float] = None, + overlap_func: Optional[str] = None, predict_overlap: Optional[float] = None, overlap_loss_multiplier: Optional[float] = None, class_loss_multiplier: Optional[float] = None, confidence_loss_multiplier: Optional[float] = None, **kwargs, ) -> None: - network = DarknetConfiguration(network_config).get_network( + network = DarknetNetwork( + network_config, + darknet_weights, match_sim_ota=match_sim_ota, match_size_ratio=match_size_ratio, match_iou_threshold=match_iou_threshold, - ignore_iou_threshold=ignore_iou_threshold, - overlap_loss=overlap_loss, + ignore_bg_threshold=ignore_bg_threshold, + overlap_func=overlap_func, predict_overlap=predict_overlap, overlap_loss_multiplier=overlap_loss_multiplier, class_loss_multiplier=class_loss_multiplier, diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index c15fa25f20..05d0cfbb2f 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -6,7 +6,15 @@ from torch.utils.data import DataLoader from pl_bolts.datasets import DummyDetectionDataset -from pl_bolts.models.detection import YOLO, DarknetConfiguration, FasterRCNN, RetinaNet +from pl_bolts.models.detection import ( + YOLO, + DarknetNetwork, + FasterRCNN, + RetinaNet, + YOLOV4TinyNetwork, + YOLOV5Network, + YOLOXNetwork, +) from pl_bolts.models.detection.faster_rcnn import create_fasterrcnn_backbone from pl_bolts.models.detection.yolo.target_matching import _sim_ota_match from pl_bolts.models.detection.yolo.utils import ( @@ -204,22 +212,79 @@ def test_iou_below(): assert not result[3, 5, 1] -def test_yolo(tmpdir): +def test_darknet(tmpdir): config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" - config = DarknetConfiguration(config_path) - model = YOLO(config.get_network()) + network = DarknetNetwork(config_path) + model = YOLO(network) image = torch.rand(1, 3, 256, 256) model(image) -def test_yolo_train(tmpdir): +def test_darknet_train(tmpdir): config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" - config = DarknetConfiguration(config_path) - model = YOLO(config.get_network()) + network = DarknetNetwork(config_path) + model = YOLO(network) - train_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) - valid_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) - trainer.fit(model, train_dataloader=train_dl, val_dataloaders=valid_dl) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + +def test_yolov4(tmpdir): + network = YOLOV4TinyNetwork(num_classes=2, width=4) + model = YOLO(network) + + image = torch.rand(1, 3, 256, 256) + model(image) + + +def test_yolov4_train(tmpdir): + network = YOLOV4TinyNetwork(num_classes=2, width=4) + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + 
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + +def test_yolov5(tmpdir): + network = YOLOV5Network(num_classes=2, depth=1, width=4) + model = YOLO(network) + + image = torch.rand(1, 3, 256, 256) + model(image) + + +def test_yolov5_train(tmpdir): + network = YOLOV5Network(num_classes=2, depth=1, width=4) + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + +def test_yolox(tmpdir): + network = YOLOXNetwork(num_classes=2, depth=1, width=4) + model = YOLO(network) + + image = torch.rand(1, 3, 256, 256) + model(image) + + +def test_yolox_train(tmpdir): + network = YOLOXNetwork(num_classes=2, depth=1, width=4) + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) From b237099f5a0429f5bfdc9ef99cbb9ad8506c3c06 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 28 Mar 2022 13:13:32 +0300 Subject: [PATCH 06/76] Improvements to YOLO * Calculates MAP metric using TorchMetrics. * Convolutional and max pooling layers automatically add the correct amount of padding. * Added YOLOv4 network. --- pl_bolts/models/detection/__init__.py | 16 +- .../models/detection/yolo/darknet_network.py | 35 +- .../yolo/{yolo_layers.py => layers.py} | 62 ++- .../detection/yolo/{yolo_loss.py => loss.py} | 0 .../models/detection/yolo/target_matching.py | 4 +- .../models/detection/yolo/torch_networks.py | 426 ++++++++++++++---- pl_bolts/models/detection/yolo/yolo_module.py | 34 +- 7 files changed, 457 insertions(+), 120 deletions(-) rename pl_bolts/models/detection/yolo/{yolo_layers.py => layers.py} (86%) rename pl_bolts/models/detection/yolo/{yolo_loss.py => loss.py} (100%) diff --git a/pl_bolts/models/detection/__init__.py b/pl_bolts/models/detection/__init__.py index 1dca7e215e..7d8615c089 100644 --- a/pl_bolts/models/detection/__init__.py +++ b/pl_bolts/models/detection/__init__.py @@ -3,9 +3,11 @@ from pl_bolts.models.detection.retinanet import RetinaNet from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork from pl_bolts.models.detection.yolo.torch_networks import ( - CSPBackbone, - TinyBackbone, + YOLOV4Backbone, + YOLOV4Network, + YOLOV4TinyBackbone, YOLOV4TinyNetwork, + YOLOV5Backbone, YOLOV5Network, YOLOXNetwork, ) @@ -14,12 +16,14 @@ __all__ = [ "components", "FasterRCNN", - "YOLO", + "RetinaNet", "DarknetNetwork", + "YOLOV4Backbone", + "YOLOV4Network", + "YOLOV4TinyBackbone", "YOLOV4TinyNetwork", + "YOLOV5Backbone", "YOLOV5Network", "YOLOXNetwork", - "TinyBackbone", - "CSPBackbone", - "RetinaNet", + "YOLO", ] diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 7b7da45766..f94a860f16 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -1,6 +1,5 @@ import io import re -from collections import OrderedDict from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union from warnings import warn @@ -11,7 +10,8 @@ from 
pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor -from pl_bolts.models.detection.yolo import yolo_layers +from pl_bolts.models.detection.yolo import layers +from pl_bolts.models.detection.yolo.layers import MaxPool from pl_bolts.models.detection.yolo.utils import get_image_size @@ -80,9 +80,9 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) image_size = get_image_size(x) for layer in self.layers: - if isinstance(layer, (yolo_layers.RouteLayer, yolo_layers.ShortcutLayer)): + if isinstance(layer, (layers.RouteLayer, layers.ShortcutLayer)): x = layer(x, outputs) - elif isinstance(layer, yolo_layers.DetectionLayer): + elif isinstance(layer, layers.DetectionLayer): x = layer(x, image_size, targets) detections.append(x) if targets is not None: @@ -131,7 +131,7 @@ def read(tensor): for layer_idx, layer in enumerate(self.layers): # Weights are loaded only to convolutional layers - if not isinstance(layer, yolo_layers.Conv): + if not isinstance(layer, layers.Conv): continue rank_zero_debug(f"Reading weights for layer {layer_idx}: {list(layer.conv.weight.shape)}") @@ -274,7 +274,7 @@ def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwarg batch_normalize = config.get("batch_normalize", False) padding = (config["size"] - 1) // 2 if config["pad"] else 0 - layer = yolo_layers.Conv( + layer = layers.Conv( num_inputs[-1], config["filters"], kernel_size=config["size"], @@ -292,22 +292,7 @@ def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. """ - kernel_size = config["size"] - padding = (kernel_size - 1) // 2 - maxpool = nn.MaxPool2d(kernel_size, config["stride"], padding) - if kernel_size % 2 == 1: - return maxpool, num_inputs[-1] - - # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d - # on both sides. - layer = nn.Sequential( - OrderedDict( - [ - ("pad", nn.ZeroPad2d((0, 1, 0, 1))), - ("maxpool", maxpool), - ] - ) - ) + layer = MaxPool(config["size"], config["stride"]) return layer, num_inputs[-1] @@ -319,7 +304,7 @@ def _create_route(config, num_inputs: List[int], **kwargs): last = len(num_inputs) - 1 source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] - layer = yolo_layers.RouteLayer(source_layers, num_chunks, chunk_idx) + layer = layers.RouteLayer(source_layers, num_chunks, chunk_idx) # The number of outputs of a source layer is the number of inputs of the next layer. 
num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers)
@@ -328,7 +313,7 @@ def _create_shortcut(config: Dict[str, Any], num_inputs: List[int], **kwargs):
-    layer = yolo_layers.ShortcutLayer(config["from"])
+    layer = layers.ShortcutLayer(config["from"])
     return layer, num_inputs[-1]
@@ -366,7 +351,7 @@ def _create_yolo(
     if class_loss_multiplier is None:
         class_loss_multiplier = config.get("cls_normalizer", 1.0)
-    layer = yolo_layers.create_detection_layer(
+    layer = layers.create_detection_layer(
         num_classes=config["classes"],
         prior_shapes=prior_shapes,
         prior_shape_idxs=config["mask"],
diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/layers.py
similarity index 86%
rename from pl_bolts/models/detection/yolo/yolo_layers.py
rename to pl_bolts/models/detection/yolo/layers.py
index fab39be6ce..8cfb497c6c 100644
--- a/pl_bolts/models/detection/yolo/yolo_layers.py
+++ b/pl_bolts/models/detection/yolo/layers.py
@@ -5,6 +5,7 @@
 from torch import Tensor, nn
 from torchvision.ops import box_convert
+from pl_bolts.models.detection.yolo.loss import LossFunction
 from pl_bolts.models.detection.yolo.target_matching import (
     HighestIoUMatching,
     IoUThresholdMatching,
@@ -12,10 +13,40 @@
     SizeRatioMatching,
 )
 from pl_bolts.models.detection.yolo.utils import global_xy
-from pl_bolts.models.detection.yolo.yolo_loss import LossFunction
 from pl_bolts.utils import _TORCHVISION_AVAILABLE


+def _get_padding(kernel_size, stride):
+    """Returns the amount of padding needed by convolutional and max pooling layers.
+
+    Determines the amount of padding needed to make the output size of the layer equal to the input size divided by
+    the stride. The first value that the function returns is the amount of padding to be added to all sides of the
+    input matrix (``padding`` argument of the operation). If an uneven amount of padding is needed on different sides
+    of the input, the second value that is returned is an ``nn.ZeroPad2d`` operation that adds an additional column
+    and row of padding. If the input size is not divisible by the stride, the output size will be rounded upwards.
+
+    Args:
+        kernel_size: Size of the kernel.
+        stride: Stride of the operation.
+
+    Returns:
+        padding, pad_op: The amount of padding to be added to all sides of the input and an ``nn.Identity`` or
+        ``nn.ZeroPad2d`` operation to add one more column and row of padding if necessary.
+    """
+    # The output size is generally (input_size + padding - max(kernel_size, stride)) / stride + 1 and we want to
+    # make it equal to input_size / stride.
+    padding, remainder = divmod(max(kernel_size, stride) - stride, 2)
+
+    # If the total amount of padding is odd, one extra column and row of padding is added to the right and bottom
+    # sides, on top of the symmetric padding that the convolution or pooling operation adds on both sides.
+    if remainder == 0:
+        pad_op = nn.Identity()
+    else:
+        pad_op = nn.ZeroPad2d((0, 1, 0, 1))
+
+    return padding, pad_op
+
+
 class DetectionLayer(nn.Module):
     """A YOLO detection layer.
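A minimal sketch of how the returned ``padding`` and ``pad_op`` are meant to combine with a pooling operation (assuming the helper above; the 5x5 kernel with stride 1 mirrors how FastSPP uses ``MaxPool``, and the 13x13 input size is arbitrary):

import torch
from torch import nn

# Kernel 5, stride 1 -> four cells of padding in total: two on every side, no extra row or column needed.
padding, remainder = divmod(max(5, 1) - 1, 2)   # (2, 0)
pad_op = nn.Identity() if remainder == 0 else nn.ZeroPad2d((0, 1, 0, 1))
maxpool = nn.MaxPool2d(kernel_size=5, stride=1, padding=padding)

x = torch.rand(1, 3, 13, 13)
assert maxpool(pad_op(x)).shape == x.shape      # the spatial size is preserved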
@@ -82,8 +113,8 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str anchors_per_cell = num_features // num_attrs if anchors_per_cell != len(self.prior_shapes): raise MisconfigurationException( - "The model predicts {} bounding boxes per cell, but {} anchor box dimensions are defined for this " - "layer.".format(anchors_per_cell, len(self.prior_shapes)) + "The model predicts {} bounding boxes per spatial location, but {} prior box dimensions are defined " + "for this layer.".format(anchors_per_cell, len(self.prior_shapes)) ) # Reshape the output to have the bounding box attributes of each grid cell on its own row. @@ -179,6 +210,9 @@ def _calculate_losses( class Conv(nn.Module): """A convolutional layer with optional layer normalization and activation. + If ``padding`` is ``None``, the module tries to add padding so much that the output size will be the input size + divided by the stride. If the input size is not divisible by the stride, the output size will be rounded upwards. + Args: in_channels: Number of input channels that the layer expects. out_channels: Number of output channels that the convolution produces. @@ -205,18 +239,38 @@ def __init__( super().__init__() if padding is None: - padding = kernel_size // 2 + padding, self.pad = _get_padding(kernel_size, stride) + else: + self.pad = nn.Identity() self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias) self.norm = create_normalization_module(norm, out_channels) self.act = create_activation_module(activation) def forward(self, x): + x = self.pad(x) x = self.conv(x) x = self.norm(x) return self.act(x) +class MaxPool(nn.Module): + """A max pooling layer with padding. + + The module tries to add padding so much that the output size will be the input size divided by the stride. If the + input size is not divisible by the stride, the output size will be rounded upwards. + """ + + def __init__(self, kernel_size: int, stride: int): + super().__init__() + padding, self.pad = _get_padding(kernel_size, stride) + self.maxpool = nn.MaxPool2d(kernel_size, stride, padding) + + def forward(self, x): + x = self.pad(x) + return self.maxpool(x) + + class RouteLayer(nn.Module): """Route layer concatenates the output (or part of it) from given layers. diff --git a/pl_bolts/models/detection/yolo/yolo_loss.py b/pl_bolts/models/detection/yolo/loss.py similarity index 100% rename from pl_bolts/models/detection/yolo/yolo_loss.py rename to pl_bolts/models/detection/yolo/loss.py diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index bcbd6fad7c..5823917c44 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -4,8 +4,8 @@ import torch from torch import Tensor +from pl_bolts.models.detection.yolo.loss import LossFunction from pl_bolts.models.detection.yolo.utils import aligned_iou, grid_centers, iou_below, is_inside_box -from pl_bolts.models.detection.yolo.yolo_loss import LossFunction from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg @@ -21,7 +21,7 @@ class ShapeMatching(ABC): Most YOLO variants match targets to anchors based on prior shapes that are assigned to the anchors in the model configuration. The subclasses of ``ShapeMatching`` implement matching rules that compare the width and height of - the targets to each prior shape (regardless of the grid cell where the target is). 
When the model includes multiple + the targets to each prior shape (regardless of the location where the target is). When the model includes multiple detection layers, different shapes are defined for each layer. Usually there are three detection layers and three prior shapes per layer. diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 4065ae7f25..bc7eadb138 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -5,17 +5,19 @@ import torch.nn as nn from torch import Tensor +from pl_bolts.models.detection.yolo.layers import Conv, MaxPool, create_detection_layer from pl_bolts.models.detection.yolo.utils import get_image_size -from pl_bolts.models.detection.yolo.yolo_layers import Conv, create_detection_layer -class Bottleneck(nn.Module): - """A bottleneck from YOLOv5. +class BottleneckBlock(nn.Module): + """A residual block with a bottleneck layer. Args: - in_channels: Number of input channels that the bottleneck expects. - out_channels: Number of output channels that the bottleneck produces. - shortcut: Whether the bottleneck should include a shortcut connection. + in_channels: Number of input channels that the block expects. + out_channels: Number of output channels that the block produces. + hidden_channels: Number of output channels the (hidden) bottleneck layer produces. By default the number of + output channels of the block. + shortcut: Whether the block should include a shortcut connection. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -25,14 +27,19 @@ def __init__( self, in_channels, out_channels, + hidden_channels: Optional[int] = None, shortcut: bool = True, activation: Optional[str] = "silu", norm: Optional[str] = "batchnorm", ): super().__init__() + + if hidden_channels is None: + hidden_channels = out_channels + self.convs = nn.Sequential( - Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm), - Conv(out_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm), + Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm), + Conv(hidden_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm), ) self.shortcut = shortcut and in_channels == out_channels @@ -41,11 +48,11 @@ def forward(self, x): return x + y if self.shortcut else y -class TinyStage(nn.Module): +class TinyBlock(nn.Module): """One stage of the "tiny" network architecture from YOLOv4. Args: - num_channels: Number of channels in the stage input and output. + num_channels: Number of channels in the input and output of the block. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -71,14 +78,16 @@ def forward(self, x): return self.mix(torch.cat((y2, y1), dim=1)) -class CSPStage(nn.Module): +class CSPBlock(nn.Module): """One stage of a Cross Stage Partial Network (CSPNet). + Encapsulates a number of bottleneck blocks in the CSP structure. + Args: - in_channels: Number of input channels that the stage expects. - out_channels: Number of output channels that the stage produces. - depth: Number of bottlenecks that the stage contains. 
- shortcut: Whether the bottlenecks should include a shortcut connection. + in_channels: Number of input channels that the CSP block expects. + out_channels: Number of output channels that the CSP block produces. + depth: Number of bottleneck blocks that the CSP block contains. + shortcut: Whether the bottleneck blocks should include a shortcut connection. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -96,15 +105,15 @@ def __init__( super().__init__() # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two # convolutions with N/2 output channels. - self.split1 = Conv(in_channels, out_channels // 2, kernel_size=1, stride=1, activation=activation, norm=norm) - self.split2 = Conv(in_channels, out_channels // 2, kernel_size=1, stride=1, activation=activation, norm=norm) - self.mix = Conv(out_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - self.bottlenecks = nn.Sequential( - *( - Bottleneck(out_channels // 2, out_channels // 2, shortcut, norm=norm, activation=activation) - for _ in range(depth) - ) - ) + hidden_channels = out_channels // 2 + self.split1 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split2 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + bottlenecks = [ + BottleneckBlock(hidden_channels, hidden_channels, shortcut=shortcut, norm=norm, activation=activation) + for _ in range(depth) + ] + self.bottlenecks = nn.Sequential(*bottlenecks) + self.mix = Conv(hidden_channels * 2, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) def forward(self, x): y1 = self.bottlenecks(self.split1(x)) @@ -135,7 +144,7 @@ def __init__( super().__init__() hidden_channels = in_channels // 2 self.conv = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - self.maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=kernel_size // 2) + self.maxpool = MaxPool(kernel_size=kernel_size, stride=1) self.mix = Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) def forward(self, x): @@ -146,7 +155,7 @@ def forward(self, x): return self.mix(torch.cat((y1, y2, y3, y4), dim=1)) -class TinyBackbone(nn.Module): +class YOLOV4TinyBackbone(nn.Module): """Backbone of the "tiny" network architecture from YOLOv4. 
Args: @@ -184,7 +193,7 @@ def maxpool(out_channels): OrderedDict( [ ("pad", nn.ZeroPad2d((0, 1, 0, 1))), - ("maxpool", nn.MaxPool2d(kernel_size=2, stride=2, padding=0)), + ("maxpool", MaxPool(kernel_size=2, stride=2)), ("smooth", smooth(out_channels)), ] ) @@ -192,11 +201,11 @@ def maxpool(out_channels): self.stage1 = Conv(3, width, kernel_size=3, stride=2, activation=activation, norm=normalization) self.downsample2 = downsample(width, width * 2) - self.stage2 = TinyStage(width * 2, activation=activation, norm=normalization) + self.stage2 = TinyBlock(width * 2, activation=activation, norm=normalization) self.downsample3 = maxpool(width * 4) - self.stage3 = TinyStage(width * 4, activation=activation, norm=normalization) + self.stage3 = TinyBlock(width * 4, activation=activation, norm=normalization) self.downsample4 = maxpool(width * 8) - self.stage4 = TinyStage(width * 8, activation=activation, norm=normalization) + self.stage4 = TinyBlock(width * 8, activation=activation, norm=normalization) self.downsample5 = maxpool(width * 16) def forward(self, x): @@ -214,7 +223,87 @@ def forward(self, x): return c1, c2, c3, c4, c5 -class CSPBackbone(nn.Module): +class YOLOV4Backbone(nn.Module): + """A backbone that approximately corresponds to the Cross Stage Partial Network from YOLOv4. + + Args: + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + width: int = 32, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def csp(num_channels, depth): + return CSPBlock(num_channels, num_channels, depth=depth) + + def spp(num_channels): + return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) + + self.stage1 = nn.Sequential( + OrderedDict( + [ + ("stem", Conv(3, width, kernel_size=3, stride=1, activation=activation, norm=normalization)), + ("downsample", downsample(width, width * 2)), + ("csp", csp(width * 2, 1)), + ] + ) + ) + self.stage2 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 2, width * 4)), + ("csp", csp(width * 4, 2)), + ] + ) + ) + self.stage3 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 4, width * 8)), + ("csp", csp(width * 8, 8)), + ] + ) + ) + self.stage4 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 8, width * 16)), + ("csp", csp(width * 16, 8)), + ] + ) + ) + self.stage5 = nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(width * 16, width * 32)), + ("csp", csp(width * 32, 4)), + ("spp", spp(width * 32)), + ] + ) + ) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + c1 = self.stage1(x) + c2 = self.stage2(c1) + c3 = self.stage3(c2) + c4 = self.stage4(c3) + c5 = self.stage5(c4) + return c1, c2, c3, c4, c5 + + +class YOLOV5Backbone(nn.Module): """The Cross Stage Partial Network backbone from YOLOv5. 
Args: @@ -235,16 +324,18 @@ def __init__( ) -> None: super().__init__() - def downsample(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + def downsample(in_channels, out_channels, kernel_size=3): + return Conv( + in_channels, out_channels, kernel_size=kernel_size, stride=2, activation=activation, norm=normalization + ) def csp(num_channels, depth): - return CSPStage(num_channels, num_channels, depth=depth) + return CSPBlock(num_channels, num_channels, depth=depth) def spp(num_channels): return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) - self.stage1 = Conv(3, width, kernel_size=6, stride=2, padding=2, activation=activation, norm=normalization) + self.stage1 = downsample(3, width, kernel_size=6) self.stage2 = nn.Sequential( OrderedDict( [ @@ -301,9 +392,9 @@ class YOLOV4TinyNetwork(nn.Module): normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for matching the targets to the anchors. The list should contain (width, height) tuples in the network input - resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are - assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that - you typically want to sort the shapes from the smallest to the largest. + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. matching_algorithm: Which algorithm to use for matching targets to anchors. 
"simota" (the SimOTA matching rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that @@ -364,16 +455,16 @@ def conv3x3(in_channels, out_channels): return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) def linear(in_channels, out_channels): - return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True) + return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=True) def detect(prior_shape_idxs): return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) - self.backbone = backbone or TinyBackbone(width=width, normalization=normalization, activation=activation) + self.backbone = backbone or YOLOV4TinyBackbone(width=width, normalization=normalization, activation=activation) - self.lateral5 = conv1x1(width * 16, width * 8) + self.fpn5 = conv1x1(width * 16, width * 8) self.out5 = nn.Sequential( conv3x3(width * 8, width * 16), linear(width * 16, num_outputs), @@ -383,14 +474,14 @@ def detect(prior_shape_idxs): nn.Upsample(scale_factor=2, mode="nearest"), ) - self.lateral4 = conv3x3(width * 12, width * 8) + self.fpn4 = conv3x3(width * 12, width * 8) self.out4 = linear(width * 8, num_outputs) self.upsample4 = nn.Sequential( conv1x1(width * 8, width * 2), nn.Upsample(scale_factor=2, mode="nearest"), ) - self.lateral3 = conv3x3(width * 6, width * 4) + self.fpn3 = conv3x3(width * 6, width * 4) self.out3 = linear(width * 4, num_outputs) self.detect3 = detect([0, 1, 2]) @@ -406,11 +497,11 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) c3, c4, c5 = self.backbone(x)[-3:] - p5 = self.lateral5(c5) + p5 = self.fpn5(c5) x = torch.cat((self.upsample5(p5), c4), dim=1) - p4 = self.lateral4(x) + p4 = self.fpn4(x) x = torch.cat((self.upsample4(p4), c3), dim=1) - p3 = self.lateral3(x) + p3 = self.fpn3(x) y = self.detect5(self.out5(p5), image_size, targets) detections.append(y) @@ -433,6 +524,185 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) return detections, losses, hits +class YOLOV4Network(nn.Module): + """Network architecture that corresponds approximately to the Cross Stage Partial Network from YOLOv4. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. 
+ matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + width: int = 32, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: List[Tuple[int, int]] = None, + **kwargs, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. 
+ if prior_shapes is None: + prior_shapes = [ + (12, 16), + (19, 36), + (40, 28), + (36, 75), + (76, 55), + (72, 146), + (142, 110), + (192, 243), + (459, 401), + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def conv1x1(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def conv3x3(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def linear(in_channels, out_channels): + return nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def block(in_channels, out_channels): + return CSPBlock( + in_channels, + out_channels, + depth=2, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def detect(prior_shape_idxs): + return create_detection_layer( + prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs + ) + + self.backbone = backbone or YOLOV4Backbone(width=width, normalization=normalization, activation=activation) + + self.pre3 = conv1x1(width * 8, width * 4) + self.fpn3 = block(width * 8, width * 8) + self.out3 = nn.Sequential( + conv3x3(width * 8, width * 8), + linear(width * 8, num_outputs), + ) + + self.pre4 = conv1x1(width * 16, width * 8) + self.fpn4 = block(width * 16, width * 16) + self.pan4 = block(width * 24, width * 16) + self.out4 = nn.Sequential( + conv3x3(width * 16, width * 16), + linear(width * 16, num_outputs), + ) + + self.pan5 = block(width * 48, width * 32) + self.out5 = nn.Sequential( + conv3x3(width * 32, width * 32), + linear(width * 32, num_outputs), + ) + + self.upsample4 = nn.Sequential( + conv1x1(width * 16, width * 4), + nn.Upsample(scale_factor=2, mode="nearest"), + ) + self.upsample5 = nn.Sequential( + conv1x1(width * 32, width * 8), + nn.Upsample(scale_factor=2, mode="nearest"), + ) + + self.downsample3 = downsample(width * 8, width * 8) + self.downsample4 = downsample(width * 16, width * 16) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5 = self.backbone(x)[-3:] + + x = self.pre4(c4) + x = torch.cat((x, self.upsample5(c5)), dim=1) + p4 = self.fpn4(x) + + x = self.pre3(c3) + x = torch.cat((x, self.upsample4(p4)), dim=1) + n3 = self.fpn3(x) + + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + + x = torch.cat((self.downsample4(n4), c5), dim=1) + n5 = self.pan5(x) + + y = self.detect3(self.out3(n3), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect3.losses) + hits.append(self.detect3.hits) + + y = self.detect4(self.out4(n4), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect4.losses) + 
hits.append(self.detect4.hits) + + y = self.detect5(self.out5(n5), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect5.losses) + hits.append(self.detect5.hits) + + return detections, losses, hits + + class YOLOV5Network(nn.Module): """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting the ``depth`` and ``width`` parameters. @@ -448,9 +718,9 @@ class YOLOV5Network(nn.Module): normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for matching the targets to the anchors. The list should contain (width, height) tuples in the network input - resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are - assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that - you typically want to sort the shapes from the smallest to the largest. + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that @@ -514,8 +784,8 @@ def conv1x1(in_channels, out_channels): def linear(in_channels, out_channels): return nn.Conv2d(in_channels, out_channels, kernel_size=1) - def bottleneck(in_channels, out_channels): - return CSPStage( + def block(in_channels, out_channels): + return CSPBlock( in_channels, out_channels, depth=depth, @@ -529,22 +799,22 @@ def detect(prior_shape_idxs): prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) - self.backbone = backbone or CSPBackbone( + self.backbone = backbone or YOLOV5Backbone( depth=depth, width=width, normalization=normalization, activation=activation ) - self.lateral3 = bottleneck(width * 8, width * 4) + self.pan3 = block(width * 8, width * 4) self.out3 = linear(width * 4, num_outputs) - self.lateral4a = nn.Sequential( - bottleneck(width * 16, width * 8), + self.fpn4 = nn.Sequential( + block(width * 16, width * 8), conv1x1(width * 8, width * 4), ) - self.lateral4b = bottleneck(width * 8, width * 8) + self.pan4 = block(width * 8, width * 8) self.out4 = linear(width * 8, num_outputs) - self.lateral5a = conv1x1(width * 16, width * 8) - self.lateral5b = bottleneck(width * 16, width * 16) + self.fpn5 = conv1x1(width * 16, width * 8) + self.pan5 = block(width * 16, width * 16) self.out5 = linear(width * 16, num_outputs) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") @@ -565,16 +835,16 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) c3, c4, c5 = self.backbone(x)[-3:] - p5 = self.lateral5a(c5) + p5 = self.fpn5(c5) x = torch.cat((self.upsample(p5), c4), dim=1) - p4 = self.lateral4a(x) + p4 = self.fpn4(x) x = torch.cat((self.upsample(p4), c3), dim=1) - n3 = self.lateral3(x) + n3 = self.pan3(x) x = torch.cat((self.downsample3(n3), p4), dim=1) - n4 = self.lateral4b(x) + n4 = self.pan4(x) x = torch.cat((self.downsample4(n4), 
p5), dim=1) - n5 = self.lateral5b(x) + n5 = self.pan5(x) y = self.detect3(self.out3(n3), image_size, targets) detections.append(y) @@ -612,9 +882,9 @@ class YOLOXNetwork(nn.Module): normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for matching the targets to the anchors. The list should contain (width, height) tuples in the network input - resolution. There should be `3N` tuples, where `N` defines the number of anchors per grid cell. They are - assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that - you typically want to sort the shapes from the smallest to the largest. + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that @@ -671,7 +941,7 @@ def linear(in_channels, out_channels): return nn.Conv2d(in_channels, out_channels, kernel_size=1) def bottleneck(in_channels, out_channels): - return CSPStage( + return CSPBlock( in_channels, out_channels, depth=depth, @@ -685,11 +955,11 @@ def detect(prior_shape_idxs): prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) - self.backbone = backbone or CSPBackbone( + self.backbone = backbone or YOLOV5Backbone( depth=depth, width=width, normalization=normalization, activation=activation ) - self.lateral3 = bottleneck(width * 8, width * 4) + self.pan3 = bottleneck(width * 8, width * 4) self.out3_stem = conv1x1(width * 4, width * 4) self.out3_feat = nn.Sequential( conv3x3(width * 4, width * 4), @@ -703,11 +973,11 @@ def detect(prior_shape_idxs): linear(width * 4, anchors_per_cell * num_classes), ) - self.lateral4a = nn.Sequential( + self.fpn4 = nn.Sequential( bottleneck(width * 16, width * 8), conv1x1(width * 8, width * 4), ) - self.lateral4b = bottleneck(width * 8, width * 8) + self.pan4 = bottleneck(width * 8, width * 8) self.out4_stem = conv1x1(width * 8, width * 4) self.out4_feat = nn.Sequential( conv3x3(width * 4, width * 4), @@ -721,8 +991,8 @@ def detect(prior_shape_idxs): linear(width * 4, anchors_per_cell * num_classes), ) - self.lateral5a = conv1x1(width * 16, width * 8) - self.lateral5b = bottleneck(width * 16, width * 16) + self.fpn5 = conv1x1(width * 16, width * 8) + self.pan5 = bottleneck(width * 16, width * 16) self.out5_stem = conv1x1(width * 16, width * 4) self.out5_feat = nn.Sequential( conv3x3(width * 4, width * 4), @@ -754,16 +1024,16 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) c3, c4, c5 = self.backbone(x)[-3:] - p5 = self.lateral5a(c5) + p5 = self.fpn5(c5) x = torch.cat((self.upsample(p5), c4), dim=1) - p4 = self.lateral4a(x) + p4 = self.fpn4(x) x = torch.cat((self.upsample(p4), c3), dim=1) - n3 = self.lateral3(x) + n3 = self.pan3(x) x = torch.cat((self.downsample3(n3), p4), dim=1) - n4 = self.lateral4b(x) + n4 = self.pan4(x) x = torch.cat((self.downsample4(n4), p5), dim=1) - n5 = 
self.lateral5b(x) + n5 = self.pan5(x) x = self.out3_stem(n3) features = self.out3_feat(x) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index fc7a242e81..27fc50df25 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -6,6 +6,7 @@ from pytorch_lightning import LightningModule from pytorch_lightning.utilities.cli import LightningCLI from torch import Tensor, optim +from torchmetrics.detection.map import MAP from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose @@ -107,12 +108,15 @@ def __init__( self.nms_threshold = nms_threshold self.detections_per_image = detections_per_image + self._val_map = MAP(compute_on_step=False) + self._test_map = MAP(compute_on_step=False) + def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets are provided, computes the losses from the detection layers. Detections are concatenated from the detection layers. Each detection layer will produce a number of detections - that depends on the size of the feature map and the number of anchors per grid cell. + that depends on the size of the feature map and the number of anchors per feature map cell. Args: images: Images to be processed. Tensor of size @@ -123,8 +127,8 @@ def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = N Returns: detections (:class:`~torch.Tensor`), losses (Dict[str, :class:`~torch.Tensor`]): Detections, and if targets were provided, a dictionary of losses. Detections are shaped - ``[batch_size, predictors, classes + 5]``, where ``predictors`` is the total number of cells in all - detection layers times the number of boxes predicted by one cell. The predicted box coordinates are in + ``[batch_size, predictors, classes + 5]``, where ``predictors`` is the total number of feature map cells in + all detection layers times the number of anchors per cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. """ detections, losses, hits = self.network(images, targets) @@ -201,13 +205,23 @@ def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], b batch_idx: The index of this batch """ images, targets = self._validate_batch(batch) - _, losses = self(images, targets) + detections, losses = self(images, targets) self.log("val/overlap_loss", losses[0], sync_dist=True) self.log("val/confidence_loss", losses[1], sync_dist=True) self.log("val/class_loss", losses[2], sync_dist=True) self.log("val/total_loss", losses.sum(), sync_dist=True) + detections = self.process_detections(detections) + targets = self.process_targets(targets) + self._val_map(detections, targets) + + def validation_epoch_end(self, outputs): + map_scores = self._val_map.compute() + map_scores = {"val/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._val_map.reset() + def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): """Evaluates a batch of data from the test set. @@ -217,13 +231,23 @@ def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_i batch_idx: The index of this batch. 
""" images, targets = self._validate_batch(batch) - _, losses = self(images, targets) + detections, losses = self(images, targets) self.log("test/overlap_loss", losses[0], sync_dist=True) self.log("test/confidence_loss", losses[1], sync_dist=True) self.log("test/class_loss", losses[2], sync_dist=True) self.log("test/total_loss", losses.sum(), sync_dist=True) + detections = self.process_detections(detections) + targets = self.process_targets(targets) + self._test_map(detections, targets) + + def test_epoch_end(self, outputs): + map_scores = self._test_map.compute() + map_scores = {"test/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._test_map.reset() + def infer(self, image: Tensor) -> Dict[str, Tensor]: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class labels. From 82f4de179df617c1a3b872a1f86c2a8a50d7611b Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Tue, 29 Mar 2022 18:46:11 +0300 Subject: [PATCH 07/76] YOLO output layer name includes the number of outputs Loading a checkpoint with a different number of output classes is possible with strict=False. --- .../models/detection/yolo/torch_networks.py | 221 ++++++++---------- pl_bolts/models/detection/yolo/yolo_module.py | 43 ++-- pl_bolts/utils/__init__.py | 1 + tests/models/test_detection.py | 24 +- 4 files changed, 148 insertions(+), 141 deletions(-) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index bc7eadb138..67b3f37572 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -103,6 +103,7 @@ def __init__( norm: Optional[str] = "batchnorm", ): super().__init__() + # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two # convolutions with N/2 output channels. 
hidden_channels = out_channels // 2 @@ -179,14 +180,7 @@ def smooth(num_channels): def downsample(in_channels, out_channels): conv = Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - return nn.Sequential( - OrderedDict( - [ - ("conv", conv), - ("smooth", smooth(out_channels)), - ] - ) - ) + return nn.Sequential(OrderedDict([("downsample", conv), ("smooth", smooth(out_channels))])) def maxpool(out_channels): return nn.Sequential( @@ -448,14 +442,16 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell - def conv1x1(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + def conv(in_channels, out_channels, kernel_size=1): + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) - def conv3x3(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + def upsample(in_channels, out_channels): + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) - def linear(in_channels, out_channels): - return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=True) + def outputs(in_channels): + return nn.Conv2d(in_channels, num_outputs, kernel_size=1, stride=1, bias=True) def detect(prior_shape_idxs): return create_detection_layer( @@ -464,25 +460,23 @@ def detect(prior_shape_idxs): self.backbone = backbone or YOLOV4TinyBackbone(width=width, normalization=normalization, activation=activation) - self.fpn5 = conv1x1(width * 16, width * 8) + self.fpn5 = conv(width * 16, width * 8) self.out5 = nn.Sequential( - conv3x3(width * 8, width * 16), - linear(width * 16, num_outputs), - ) - self.upsample5 = nn.Sequential( - conv1x1(width * 8, width * 4), - nn.Upsample(scale_factor=2, mode="nearest"), + OrderedDict( + [ + ("channels", conv(width * 8, width * 16)), + (f"outputs_{num_outputs}", outputs(width * 16)), + ] + ) ) + self.upsample5 = upsample(width * 8, width * 4) - self.fpn4 = conv3x3(width * 12, width * 8) - self.out4 = linear(width * 8, num_outputs) - self.upsample4 = nn.Sequential( - conv1x1(width * 8, width * 2), - nn.Upsample(scale_factor=2, mode="nearest"), - ) + self.fpn4 = conv(width * 12, width * 8, kernel_size=3) + self.out4 = nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs(width * 8))])) + self.upsample4 = upsample(width * 8, width * 2) - self.fpn3 = conv3x3(width * 6, width * 4) - self.out3 = linear(width * 4, num_outputs) + self.fpn3 = conv(width * 6, width * 4, kernel_size=3) + self.out3 = nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs(width * 4))])) self.detect3 = detect([0, 1, 2]) self.detect4 = detect([3, 4, 5]) @@ -593,19 +587,10 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell - def downsample(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - - def conv1x1(in_channels, out_channels): + def conv(in_channels, out_channels): return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) - def conv3x3(in_channels, out_channels): - 
return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) - - def linear(in_channels, out_channels): - return nn.Conv2d(in_channels, out_channels, kernel_size=1) - - def block(in_channels, out_channels): + def csp(in_channels, out_channels): return CSPBlock( in_channels, out_channels, @@ -615,6 +600,19 @@ def block(in_channels, out_channels): activation=activation, ) + def out(num_channels): + conv = Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + outputs = nn.Conv2d(num_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels, out_channels): + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels, out_channels): + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + def detect(prior_shape_idxs): return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs @@ -622,35 +620,20 @@ def detect(prior_shape_idxs): self.backbone = backbone or YOLOV4Backbone(width=width, normalization=normalization, activation=activation) - self.pre3 = conv1x1(width * 8, width * 4) - self.fpn3 = block(width * 8, width * 8) - self.out3 = nn.Sequential( - conv3x3(width * 8, width * 8), - linear(width * 8, num_outputs), - ) + self.pre3 = conv(width * 8, width * 4) + self.fpn3 = csp(width * 8, width * 8) + self.out3 = out(width * 8) - self.pre4 = conv1x1(width * 16, width * 8) - self.fpn4 = block(width * 16, width * 16) - self.pan4 = block(width * 24, width * 16) - self.out4 = nn.Sequential( - conv3x3(width * 16, width * 16), - linear(width * 16, num_outputs), - ) + self.pre4 = conv(width * 16, width * 8) + self.fpn4 = csp(width * 16, width * 16) + self.pan4 = csp(width * 24, width * 16) + self.out4 = out(width * 16) - self.pan5 = block(width * 48, width * 32) - self.out5 = nn.Sequential( - conv3x3(width * 32, width * 32), - linear(width * 32, num_outputs), - ) + self.pan5 = csp(width * 48, width * 32) + self.out5 = out(width * 32) - self.upsample4 = nn.Sequential( - conv1x1(width * 16, width * 4), - nn.Upsample(scale_factor=2, mode="nearest"), - ) - self.upsample5 = nn.Sequential( - conv1x1(width * 32, width * 8), - nn.Upsample(scale_factor=2, mode="nearest"), - ) + self.upsample4 = upsample(width * 16, width * 4) + self.upsample5 = upsample(width * 32, width * 8) self.downsample3 = downsample(width * 8, width * 8) self.downsample4 = downsample(width * 16, width * 16) @@ -778,13 +761,14 @@ def __init__( def downsample(in_channels, out_channels): return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def conv1x1(in_channels, out_channels): + def conv(in_channels, out_channels): return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) - def linear(in_channels, out_channels): - return nn.Conv2d(in_channels, out_channels, kernel_size=1) + def out(in_channels): + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs)])) - def block(in_channels, out_channels): + def csp(in_channels, out_channels): return CSPBlock( in_channels, out_channels, @@ -803,19 +787,23 @@ def 
detect(prior_shape_idxs): depth=depth, width=width, normalization=normalization, activation=activation ) - self.pan3 = block(width * 8, width * 4) - self.out3 = linear(width * 4, num_outputs) + self.pan3 = csp(width * 8, width * 4) + self.out3 = out(width * 4) self.fpn4 = nn.Sequential( - block(width * 16, width * 8), - conv1x1(width * 8, width * 4), + OrderedDict( + [ + ("csp", csp(width * 16, width * 8)), + ("conv", conv(width * 8, width * 4)), + ] + ) ) - self.pan4 = block(width * 8, width * 8) - self.out4 = linear(width * 8, num_outputs) + self.pan4 = csp(width * 8, width * 8) + self.out4 = out(width * 8) - self.fpn5 = conv1x1(width * 16, width * 8) - self.pan5 = block(width * 16, width * 16) - self.out5 = linear(width * 16, num_outputs) + self.fpn5 = conv(width * 16, width * 8) + self.pan5 = csp(width * 16, width * 16) + self.out5 = out(width * 16) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") @@ -931,16 +919,13 @@ def __init__( def downsample(in_channels, out_channels): return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def conv1x1(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) - - def conv3x3(in_channels, out_channels): - return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + def conv(in_channels, out_channels, kernel_size=1): + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) def linear(in_channels, out_channels): return nn.Conv2d(in_channels, out_channels, kernel_size=1) - def bottleneck(in_channels, out_channels): + def csp(in_channels, out_channels): return CSPBlock( in_channels, out_channels, @@ -950,6 +935,17 @@ def bottleneck(in_channels, out_channels): activation=activation, ) + def features(num_channels): + return nn.Sequential( + conv(num_channels, num_channels, kernel_size=3), + conv(num_channels, num_channels, kernel_size=3), + ) + + def classprob(num_channels): + num_outputs = anchors_per_cell * num_classes + outputs = linear(num_channels, num_outputs) + return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) + def detect(prior_shape_idxs): return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs @@ -959,52 +955,35 @@ def detect(prior_shape_idxs): depth=depth, width=width, normalization=normalization, activation=activation ) - self.pan3 = bottleneck(width * 8, width * 4) - self.out3_stem = conv1x1(width * 4, width * 4) - self.out3_feat = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), - ) + self.pan3 = csp(width * 8, width * 4) + self.out3_stem = conv(width * 4, width * 4) + self.out3_feat = features(width * 4) self.out3_box = linear(width * 4, anchors_per_cell * 4) self.out3_confidence = linear(width * 4, anchors_per_cell) - self.out3_classprob = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), - linear(width * 4, anchors_per_cell * num_classes), - ) + self.out3_classprob = classprob(width * 4) self.fpn4 = nn.Sequential( - bottleneck(width * 16, width * 8), - conv1x1(width * 8, width * 4), - ) - self.pan4 = bottleneck(width * 8, width * 8) - self.out4_stem = conv1x1(width * 8, width * 4) - self.out4_feat = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), + OrderedDict( + [ + ("csp", 
csp(width * 16, width * 8)), + ("conv", conv(width * 8, width * 4)), + ] + ) ) + self.pan4 = csp(width * 8, width * 8) + self.out4_stem = conv(width * 8, width * 4) + self.out4_feat = features(width * 4) self.out4_box = linear(width * 4, anchors_per_cell * 4) self.out4_confidence = linear(width * 4, anchors_per_cell) - self.out4_classprob = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), - linear(width * 4, anchors_per_cell * num_classes), - ) + self.out4_classprob = classprob(width * 4) - self.fpn5 = conv1x1(width * 16, width * 8) - self.pan5 = bottleneck(width * 16, width * 16) - self.out5_stem = conv1x1(width * 16, width * 4) - self.out5_feat = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), - ) + self.fpn5 = conv(width * 16, width * 8) + self.pan5 = csp(width * 16, width * 16) + self.out5_stem = conv(width * 16, width * 4) + self.out5_feat = features(width * 4) self.out5_box = linear(width * 4, anchors_per_cell * 4) self.out5_confidence = linear(width * 4, anchors_per_cell) - self.out5_classprob = nn.Sequential( - conv3x3(width * 4, width * 4), - conv3x3(width * 4, width * 4), - linear(width * 4, anchors_per_cell * num_classes), - ) + self.out5_classprob = classprob(width * 4) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 27fc50df25..78c4167fa3 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -6,15 +6,17 @@ from pytorch_lightning import LightningModule from pytorch_lightning.utilities.cli import LightningCLI from torch import Tensor, optim -from torchmetrics.detection.map import MAP from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR -from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils import _TORCHMETRICS_DETECTION_AVAILABLE, _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg +if _TORCHMETRICS_DETECTION_AVAILABLE: + from torchmetrics.detection.map import MAP + if _TORCHVISION_AVAILABLE: from torchvision.ops import batched_nms from torchvision.transforms import functional as F @@ -108,8 +110,9 @@ def __init__( self.nms_threshold = nms_threshold self.detections_per_image = detections_per_image - self._val_map = MAP(compute_on_step=False) - self._test_map = MAP(compute_on_step=False) + if _TORCHMETRICS_DETECTION_AVAILABLE: + self._val_map = MAP(compute_on_step=False) + self._test_map = MAP(compute_on_step=False) def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets @@ -212,15 +215,17 @@ def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], b self.log("val/class_loss", losses[2], sync_dist=True) self.log("val/total_loss", losses.sum(), sync_dist=True) - detections = self.process_detections(detections) - targets = self.process_targets(targets) - self._val_map(detections, targets) + if _TORCHMETRICS_DETECTION_AVAILABLE: + detections = self.process_detections(detections) + targets = self.process_targets(targets) + self._val_map(detections, targets) def validation_epoch_end(self, outputs): - 
map_scores = self._val_map.compute() - map_scores = {"val/" + k: v for k, v in map_scores.items()} - self.log_dict(map_scores, sync_dist=True) - self._val_map.reset() + if _TORCHMETRICS_DETECTION_AVAILABLE: + map_scores = self._val_map.compute() + map_scores = {"val/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._val_map.reset() def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): """Evaluates a batch of data from the test set. @@ -238,15 +243,17 @@ def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_i self.log("test/class_loss", losses[2], sync_dist=True) self.log("test/total_loss", losses.sum(), sync_dist=True) - detections = self.process_detections(detections) - targets = self.process_targets(targets) - self._test_map(detections, targets) + if _TORCHMETRICS_DETECTION_AVAILABLE: + detections = self.process_detections(detections) + targets = self.process_targets(targets) + self._test_map(detections, targets) def test_epoch_end(self, outputs): - map_scores = self._test_map.compute() - map_scores = {"test/" + k: v for k, v in map_scores.items()} - self.log_dict(map_scores, sync_dist=True) - self._test_map.reset() + if _TORCHMETRICS_DETECTION_AVAILABLE: + map_scores = self._test_map.compute() + map_scores = {"test/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._test_map.reset() def infer(self, image: Tensor) -> Dict[str, Tensor]: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class diff --git a/pl_bolts/utils/__init__.py b/pl_bolts/utils/__init__.py index aceba1cf0e..ad1db94847 100644 --- a/pl_bolts/utils/__init__.py +++ b/pl_bolts/utils/__init__.py @@ -32,6 +32,7 @@ def _compare_version(package: str, op: Callable, version: str) -> bool: _NATIVE_AMP_AVAILABLE: bool = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") _TORCHVISION_AVAILABLE: bool = _module_available("torchvision") +_TORCHMETRICS_DETECTION_AVAILABLE: bool = _module_available("torchmetrics.detection") _GYM_AVAILABLE: bool = _module_available("gym") _SKLEARN_AVAILABLE: bool = _module_available("sklearn") _PIL_AVAILABLE: bool = _module_available("PIL") diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 05d0cfbb2f..cd5217ebbd 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -11,6 +11,7 @@ DarknetNetwork, FasterRCNN, RetinaNet, + YOLOV4Network, YOLOV4TinyNetwork, YOLOV5Network, YOLOXNetwork, @@ -233,7 +234,7 @@ def test_darknet_train(tmpdir): trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolov4(tmpdir): +def test_yolov4_tiny(tmpdir): network = YOLOV4TinyNetwork(num_classes=2, width=4) model = YOLO(network) @@ -241,7 +242,7 @@ def test_yolov4(tmpdir): model(image) -def test_yolov4_train(tmpdir): +def test_yolov4_tiny_train(tmpdir): network = YOLOV4TinyNetwork(num_classes=2, width=4) model = YOLO(network) @@ -252,6 +253,25 @@ def test_yolov4_train(tmpdir): trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) +def test_yolov4(tmpdir): + network = YOLOV4Network(num_classes=2, width=4) + model = YOLO(network) + + image = torch.rand(1, 3, 256, 256) + model(image) + + +def test_yolov4_train(tmpdir): + network = YOLOV4Network(num_classes=2, width=4) + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = 
DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + def test_yolov5(tmpdir): network = YOLOV5Network(num_classes=2, depth=1, width=4) model = YOLO(network) From 8a201a01c8ebe516457e2f2a7694ca89fe225529 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 14 Apr 2022 12:59:53 +0200 Subject: [PATCH 08/76] Complete type hints --- .../models/detection/yolo/darknet_network.py | 85 ++++++++------ pl_bolts/models/detection/yolo/layers.py | 35 +++--- pl_bolts/models/detection/yolo/loss.py | 21 +++- .../models/detection/yolo/target_matching.py | 32 +++--- .../models/detection/yolo/torch_networks.py | 107 +++++++++--------- pl_bolts/models/detection/yolo/types.py | 7 ++ pl_bolts/models/detection/yolo/utils.py | 14 +-- pl_bolts/models/detection/yolo/yolo_module.py | 37 +++--- 8 files changed, 190 insertions(+), 148 deletions(-) create mode 100644 pl_bolts/models/detection/yolo/types.py diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index f94a860f16..fa9897d145 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -12,13 +12,18 @@ from pl_bolts.models.detection.yolo import layers from pl_bolts.models.detection.yolo.layers import MaxPool +from pl_bolts.models.detection.yolo.torch_networks import NETWORK_OUTPUT +from pl_bolts.models.detection.yolo.types import TARGETS from pl_bolts.models.detection.yolo.utils import get_image_size +CONFIG = Dict[str, Any] +CREATE_LAYER_OUTPUT = Tuple[nn.Module, int] # layer, num_outputs + class DarknetNetwork(nn.Module): """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation.""" - def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwargs) -> None: + def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwargs: Any) -> None: """Parses a Darknet configuration file and creates the network structure. Iterates through the layers from the configuration and creates corresponding PyTorch modules. 
If @@ -63,19 +68,19 @@ def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwarg num_inputs = [global_config.get("channels", 3)] for layer_config in layer_configs: config = {**global_config, **layer_config} - module, num_outputs = _create_layer(config, num_inputs, **kwargs) - self.layers.append(module) + layer, num_outputs = _create_layer(config, num_inputs, **kwargs) + self.layers.append(layer) num_inputs.append(num_outputs) if weights_path is not None: with open(weights_path) as weight_file: self.load_weights(weight_file) - def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: - outputs = [] # Outputs from all layers - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + outputs: List[Tensor] = [] # Outputs from all layers + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -95,7 +100,7 @@ def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) return detections, losses, hits - def load_weights(self, weight_file: io.IOBase): + def load_weights(self, weight_file: io.IOBase) -> None: """Loads weights to layer modules from a pretrained Darknet model. One may want to continue training from pretrained weights, on a dataset with a different number of object @@ -117,17 +122,18 @@ def load_weights(self, weight_file: io.IOBase): f"that has been trained on {images_seen[0]} images." ) - def read(tensor): + def read(tensor: Tensor) -> int: """Reads the contents of ``tensor`` from the current position of ``weight_file``. - If there's no more data in ``weight_file``, returns without error. + Returns the number of elements read. If there's no more data in ``weight_file``, returns 0. """ x = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) - if x.size > 0: + num_elements = x.size + if num_elements > 0: x = torch.from_numpy(x).view_as(tensor) with torch.no_grad(): tensor.copy_(x) - return x.size + return num_elements for layer_idx, layer in enumerate(self.layers): # Weights are loaded only to convolutional layers @@ -139,8 +145,12 @@ def read(tensor): # If convolution is followed by batch normalization, read the batch normalization parameters. Otherwise we # read the convolution bias. if isinstance(layer.norm, nn.Identity): + assert layer.conv.bias is not None read(layer.conv.bias) else: + assert isinstance(layer.norm, nn.BatchNorm2d) + assert layer.norm.running_mean is not None + assert layer.norm.running_var is not None read(layer.norm.bias) read(layer.norm.weight) read(layer.norm.running_mean) @@ -151,7 +161,7 @@ def read(tensor): return def _read_config(self, config_file: Iterable[str]) -> List[Dict[str, Any]]: - """Reads a YOLOv4 network configuration file and returns a list of configuration sections. + """Reads a Darnet network configuration file and returns a list of configuration sections. Args: config_file: The configuration file to read. 
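
The sequential read in load_weights() above relies on the Darknet weight file being a flat stream of float32 values with no per-tensor framing, so the tensors have to be filled in exactly the order Darknet wrote them, and hitting the end of the file is signalled by a zero-length read rather than an error. The following is a minimal, self-contained sketch of that idea; the helper name read_into, the five-field header size and the batch-norm-before-convolution parameter order are assumptions for illustration, not guarantees about this implementation.

import io

import numpy as np
import torch
from torch import nn


def read_into(tensor: torch.Tensor, weight_file: io.IOBase) -> int:
    # Fill ``tensor`` with float32 values read from the current file position.
    # Returns how many elements were read; 0 signals that the file ran out of data.
    values = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32)
    if values.size > 0:
        with torch.no_grad():
            tensor.copy_(torch.from_numpy(values).view_as(tensor))
    return values.size


conv = nn.Conv2d(3, 8, kernel_size=3, bias=False)
bn = nn.BatchNorm2d(8)
with open("darknet.weights", "rb") as weight_file:  # hypothetical weight file
    weight_file.read(5 * 4)  # skip the header (assumed here to be five 32-bit fields)
    # Assumed parameter order for a convolution that is followed by batch normalization.
    for param in (bn.bias, bn.weight, bn.running_mean, bn.running_var, conv.weight):
        if read_into(param, weight_file) == 0:
            break

Returning the element count instead of raising lets the caller stop loading cleanly when a file is shorter than the model, which is useful when continuing training from weights that were saved for a different number of classes.
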
@@ -214,16 +224,15 @@ def _read_config(self, config_file: Iterable[str]) -> List[Dict[str, Any]]: section = None sections = [] - def convert(key, value): + def convert(key: str, value: str) -> Union[str, int, float, List[Union[str, int, float]]]: """Converts a value to the correct type based on key.""" if key not in variable_types: warn("Unknown YOLO configuration variable: " + key) - return key, value + return value if key in list_variables: - value = [variable_types[key](v) for v in value.split(",")] + return [variable_types[key](v) for v in value.split(",")] else: - value = variable_types[key](value) - return key, value + return variable_types[key](value) for line in config_file: line = line.strip() @@ -236,18 +245,19 @@ def convert(key, value): sections.append(section) section = {"type": section_match.group(1)} else: + if section is None: + raise RuntimeError("Darknet network configuration file does not start with a section header.") key, value = line.split("=") key = key.rstrip() value = value.lstrip() - key, value = convert(key, value) - section[key] = value + section[key] = convert(key, value) if section is not None: sections.append(section) return sections -def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tuple[nn.Module, int]: +def _create_layer(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the layer config. @@ -259,7 +269,7 @@ def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tu module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in its output. """ - create_func = { + create_func: Dict[str, Callable[..., CREATE_LAYER_OUTPUT]] = { "convolutional": _create_convolutional, "maxpool": _create_maxpool, "route": _create_route, @@ -270,7 +280,7 @@ def _create_layer(config: Dict[str, Any], num_inputs: List[int], **kwargs) -> Tu return create_func[config["type"]](config, num_inputs, **kwargs) -def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwargs): +def _create_convolutional(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: batch_normalize = config.get("batch_normalize", False) padding = (config["size"] - 1) // 2 if config["pad"] else 0 @@ -287,7 +297,7 @@ def _create_convolutional(config: Dict[str, Any], num_inputs: List[int], **kwarg return layer, config["filters"] -def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): +def _create_maxpool(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: """Creates a max pooling layer. Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. 
@@ -296,7 +306,7 @@ def _create_maxpool(config: Dict[str, Any], num_inputs: List[int], **kwargs): return layer, num_inputs[-1] -def _create_route(config, num_inputs: List[int], **kwargs): +def _create_route(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: num_chunks = config.get("groups", 1) chunk_idx = config.get("group_id", 0) @@ -312,44 +322,49 @@ def _create_route(config, num_inputs: List[int], **kwargs): return layer, num_outputs -def _create_shortcut(config: Dict[str, Any], num_inputs: List[int], **kwargs): +def _create_shortcut(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: layer = layers.ShortcutLayer(config["from"]) return layer, num_inputs[-1] -def _create_upsample(config: Dict[str, Any], num_inputs: List[int], **kwargs): +def _create_upsample(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") return layer, num_inputs[-1] def _create_yolo( - config: Dict[str, Any], + config: CONFIG, num_inputs: List[int], prior_shapes: Optional[List[Tuple[int, int]]] = None, matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, ignore_bg_threshold: Optional[float] = None, overlap_func: Optional[Union[str, Callable]] = None, - predict_overlap: Optional[float] = None, + predict_overlap: float = 1.0, overlap_loss_multiplier: Optional[float] = None, confidence_loss_multiplier: Optional[float] = None, class_loss_multiplier: Optional[float] = None, - **kwargs, -): + **kwargs: Any, +) -> CREATE_LAYER_OUTPUT: if prior_shapes is None: # The "anchors" list alternates width and height. - prior_shapes = config["anchors"] - prior_shapes = [(prior_shapes[i], prior_shapes[i + 1]) for i in range(0, len(prior_shapes), 2)] + dims = config["anchors"] + prior_shapes = [(dims[i], dims[i + 1]) for i in range(0, len(dims), 2)] if ignore_bg_threshold is None: ignore_bg_threshold = config.get("ignore_thresh", 1.0) + assert isinstance(ignore_bg_threshold, float) if overlap_func is None: overlap_func = config.get("iou_loss", "iou") + assert isinstance(overlap_func, str) if overlap_loss_multiplier is None: overlap_loss_multiplier = config.get("iou_normalizer", 1.0) + assert isinstance(overlap_loss_multiplier, float) if confidence_loss_multiplier is None: confidence_loss_multiplier = config.get("obj_normalizer", 1.0) + assert isinstance(confidence_loss_multiplier, float) if class_loss_multiplier is None: class_loss_multiplier = config.get("cls_normalizer", 1.0) + assert isinstance(class_loss_multiplier, float) layer = layers.create_detection_layer( num_classes=config["classes"], @@ -366,4 +381,4 @@ def _create_yolo( xy_scale=config.get("scale_x_y", 1.0), input_is_normalized=config.get("new_coords", 0) > 0, ) - return layer, None + return layer, 0 diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 8cfb497c6c..82b80eb9bc 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -9,6 +9,7 @@ from pl_bolts.models.detection.yolo.target_matching import ( HighestIoUMatching, IoUThresholdMatching, + ShapeMatching, SimOTAMatching, SizeRatioMatching, ) @@ -16,7 +17,7 @@ from pl_bolts.utils import 
_TORCHVISION_AVAILABLE -def _get_padding(kernel_size, stride): +def _get_padding(kernel_size: int, stride: int) -> Tuple[int, nn.Module]: """Returns the amount of padding needed by convolutional and max pooling layers. Determines the amount of padding needed to make the output size of the layer the input size divided by the stride. @@ -39,10 +40,7 @@ def _get_padding(kernel_size, stride): # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d # on both sides. - if remainder == 0: - pad_op = nn.Identity() - else: - pad_op = nn.ZeroPad2d((0, 1, 0, 1)) + pad_op: nn.Module = nn.Identity() if remainder == 0 else nn.ZeroPad2d((0, 1, 0, 1)) return padding, pad_op @@ -159,7 +157,7 @@ def _calculate_losses( preds: List[Dict[str, Tensor]], targets: List[Dict[str, Tensor]], image_size: Tensor, - ): + ) -> None: """Matches the predictions to targets and calculates the losses. Creates the attributes ``losses`` and ``hits``. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. ``hits`` is the number of targets that this layer was responsible for. @@ -247,7 +245,7 @@ def __init__( self.norm = create_normalization_module(norm, out_channels) self.act = create_activation_module(activation) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = self.pad(x) x = self.conv(x) x = self.norm(x) @@ -266,7 +264,7 @@ def __init__(self, kernel_size: int, stride: int): padding, self.pad = _get_padding(kernel_size, stride) self.maxpool = nn.MaxPool2d(kernel_size, stride, padding) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = self.pad(x) return self.maxpool(x) @@ -286,7 +284,7 @@ def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> self.num_chunks = num_chunks self.chunk_idx = chunk_idx - def forward(self, x, outputs): + def forward(self, x: Tensor, outputs: List[Tensor]) -> Tensor: chunks = [torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] for layer in self.source_layers] return torch.cat(chunks, dim=1) @@ -302,14 +300,14 @@ def __init__(self, source_layer: int) -> None: super().__init__() self.source_layer = source_layer - def forward(self, x, outputs): + def forward(self, x: Tensor, outputs: List[Tensor]) -> Tensor: return outputs[-1] + outputs[self.source_layer] class Mish(nn.Module): """Mish activation.""" - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: return x * torch.tanh(nn.functional.softplus(x)) @@ -354,8 +352,8 @@ def create_normalization_module(name: Optional[str], num_channels: int) -> nn.Mo def create_detection_layer( - prior_shapes: List[Tuple[int, int]], - prior_shape_idxs: List[int], + prior_shapes: Sequence[Tuple[int, int]], + prior_shape_idxs: Sequence[int], matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, ignore_bg_threshold: float = 0.7, @@ -364,8 +362,8 @@ def create_detection_layer( overlap_loss_multiplier: float = 5.0, confidence_loss_multiplier: float = 1.0, class_loss_multiplier: float = 1.0, - **kwargs, -) -> Tuple[Callable, LossFunction]: + **kwargs: Any, +) -> DetectionLayer: """Creates a detection layer module and the required loss function and target matching objects. Args: @@ -394,14 +392,19 @@ def create_detection_layer( xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. 
""" + matching_func: Union[ShapeMatching, SimOTAMatching] if matching_algorithm == "simota": loss_func = LossFunction( overlap_func, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier ) matching_func = SimOTAMatching(loss_func) elif matching_algorithm == "size": + if matching_threshold is None: + raise ValueError("matching_threshold is required with size ratio matching.") matching_func = SizeRatioMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) elif matching_algorithm == "iou": + if matching_threshold is None: + raise ValueError("matching_threshold is required with IoU threshold matching.") matching_func = IoUThresholdMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) elif matching_algorithm == "maxiou" or matching_algorithm is None: matching_func = HighestIoUMatching(prior_shapes, prior_shape_idxs, ignore_bg_threshold) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index db261723fd..c5316c02d4 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -1,5 +1,5 @@ import math -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Dict, Optional, Tuple, Union import torch from torch import Tensor @@ -143,7 +143,7 @@ def _calculate_overlap( overlap_loss = overlap_loss * size_compensation return overlap, overlap_loss - def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callable): + def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callable) -> Tensor: """Calculates the confidence loss for foreground anchors. If ``self.predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target @@ -179,7 +179,7 @@ def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callab return result - def _calculate_bg_confidence(self, preds: Tensor, bce_func: Callable): + def _calculate_bg_confidence(self, preds: Tensor, bce_func: Callable) -> Tensor: """Calculates the confidence loss for background anchors.""" targets = torch.zeros_like(preds) return bce_func(preds, targets, reduction="none") @@ -218,7 +218,13 @@ def _calculate_class(self, preds: Tensor, targets: Tensor, bce_func: Callable) - preds, targets = torch.broadcast_tensors(preds, targets) return bce_func(preds, targets, reduction="none").sum(-1) - def __call__(self, preds, targets, input_is_normalized: bool, image_size: Optional[Tensor] = None): + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + input_is_normalized: bool, + image_size: Optional[Tensor] = None, + ) -> None: """Calculates the losses for all pairs of a predictions and a target, and if `bg_confidences` appears in ``preds``, calculates the confidence loss for background predictions. @@ -231,7 +237,10 @@ def __call__(self, preds, targets, input_is_normalized: bool, image_size: Option input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. image_size: Width and height in a vector that defines the scale of the target coordinates. 
""" - bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits + if input_is_normalized: + bce_func = binary_cross_entropy + else: + bce_func = binary_cross_entropy_with_logits overlap, overlap_loss = self._calculate_overlap(preds["boxes"], targets["boxes"], image_size) self.overlap = overlap @@ -247,7 +256,7 @@ def __call__(self, preds, targets, input_is_normalized: bool, image_size: Option class_loss = self._calculate_class(preds["classprobs"], targets["labels"], bce_func) self.class_loss = class_loss * self.class_multiplier - def sums(self): + def sums(self) -> Tuple[Tensor, Tensor, Tensor]: """Returns the sums of the losses over prediction/target pairs, assuming the predictions and targets have been matched (there are as many predictions and targets).""" overlap_loss = self.overlap_loss.diagonal().sum() diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 5823917c44..ba38e0c4f0 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Tuple +from typing import Dict, Sequence, Tuple, Union import torch from torch import Tensor @@ -31,7 +31,7 @@ class ShapeMatching(ABC): the confidence loss. """ - def __init__(self, ignore_bg_threshold: float = 0.7): + def __init__(self, ignore_bg_threshold: float = 0.7) -> None: self.ignore_bg_threshold = ignore_bg_threshold def __call__( @@ -90,7 +90,7 @@ def __call__( return preds, targets @abstractmethod - def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. Args: @@ -119,8 +119,8 @@ class HighestIoUMatching(ShapeMatching): """ def __init__( - self, prior_shapes: List[Tuple[int, int]], prior_shape_idxs: List[int], ignore_bg_threshold: float = 0.7 - ): + self, prior_shapes: Sequence[Tuple[int, int]], prior_shape_idxs: Sequence[int], ignore_bg_threshold: float = 0.7 + ) -> None: super().__init__(ignore_bg_threshold) self.prior_shapes = prior_shapes # anchor_map maps the anchor indices to predictors in this layer, or to -1 if it's not an anchor of this layer. 
@@ -129,7 +129,7 @@ def __init__( prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 for idx in range(len(prior_shapes)) ] - def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) @@ -157,16 +157,16 @@ class IoUThresholdMatching(ShapeMatching): def __init__( self, - prior_shapes: List[Tuple[int, int]], - prior_shape_idxs: List[int], + prior_shapes: Sequence[Tuple[int, int]], + prior_shape_idxs: Sequence[int], threshold: float, ignore_bg_threshold: float = 0.7, - ): + ) -> None: super().__init__(ignore_bg_threshold) self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] self.threshold = threshold - def match(self, wh): + def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) ious = aligned_iou(wh, prior_wh) @@ -193,16 +193,16 @@ class SizeRatioMatching(ShapeMatching): def __init__( self, - prior_shapes: List[Tuple[int, int]], - prior_shape_idxs: List[int], + prior_shapes: Sequence[Tuple[int, int]], + prior_shape_idxs: Sequence[int], threshold: float, ignore_bg_threshold: float = 0.7, - ): + ) -> None: super().__init__(ignore_bg_threshold) self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] self.threshold = threshold - def match(self, wh): + def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) wh_ratio = wh[:, None, :] / prior_wh[None, :, :] # [num_targets, num_anchors, 2] @@ -212,7 +212,7 @@ def match(self, wh): return below_threshold.T -def _sim_ota_match(costs, ious): +def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]: """Implements the SimOTA matching rule. The number of units supplied by each supplier (training target) needs to be decided in the Optimal Transport @@ -261,7 +261,7 @@ class SimOTAMatching: loss_func: A ``LossFunction`` object that can be used to calculate the pairwise costs. 
""" - def __init__(self, loss_func: LossFunction): + def __init__(self, loss_func: LossFunction) -> None: self.loss_func = loss_func def __call__( diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 67b3f37572..9e63d2c94d 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -1,11 +1,12 @@ from collections import OrderedDict -from typing import Dict, List, Optional, Tuple +from typing import Any, List, Optional, Sequence, Tuple import torch import torch.nn as nn from torch import Tensor -from pl_bolts.models.detection.yolo.layers import Conv, MaxPool, create_detection_layer +from pl_bolts.models.detection.yolo.layers import Conv, DetectionLayer, MaxPool, create_detection_layer +from pl_bolts.models.detection.yolo.types import NETWORK_OUTPUT, TARGETS from pl_bolts.models.detection.yolo.utils import get_image_size @@ -25,13 +26,13 @@ class BottleneckBlock(nn.Module): def __init__( self, - in_channels, - out_channels, + in_channels: int, + out_channels: int, hidden_channels: Optional[int] = None, shortcut: bool = True, activation: Optional[str] = "silu", norm: Optional[str] = "batchnorm", - ): + ) -> None: super().__init__() if hidden_channels is None: @@ -43,7 +44,7 @@ def __init__( ) self.shortcut = shortcut and in_channels == out_channels - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: y = self.convs(x) return x + y if self.shortcut else y @@ -63,7 +64,7 @@ def __init__( num_channels: int, activation: Optional[str] = "leaky", norm: Optional[str] = "batchnorm", - ): + ) -> None: super().__init__() hidden_channels = num_channels // 2 @@ -71,7 +72,7 @@ def __init__( self.conv2 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) self.mix = Conv(num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = torch.chunk(x, 2, dim=1)[1] y1 = self.conv1(x) y2 = self.conv2(y1) @@ -101,7 +102,7 @@ def __init__( shortcut: bool = True, activation: Optional[str] = "silu", norm: Optional[str] = "batchnorm", - ): + ) -> None: super().__init__() # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two @@ -116,7 +117,7 @@ def __init__( self.bottlenecks = nn.Sequential(*bottlenecks) self.mix = Conv(hidden_channels * 2, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: y1 = self.bottlenecks(self.split1(x)) y2 = self.split2(x) return self.mix(torch.cat((y1, y2), dim=1)) @@ -148,7 +149,7 @@ def __init__( self.maxpool = MaxPool(kernel_size=kernel_size, stride=1) self.mix = Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: y1 = self.conv(x) y2 = self.maxpool(y1) y3 = self.maxpool(y2) @@ -175,14 +176,14 @@ def __init__( ): super().__init__() - def smooth(num_channels): + def smooth(num_channels: int) -> nn.Module: return Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) - def downsample(in_channels, out_channels): + def downsample(in_channels: int, out_channels: int) -> nn.Module: conv = Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) return 
nn.Sequential(OrderedDict([("downsample", conv), ("smooth", smooth(out_channels))])) - def maxpool(out_channels): + def maxpool(out_channels: int) -> nn.Module: return nn.Sequential( OrderedDict( [ @@ -202,7 +203,7 @@ def maxpool(out_channels): self.stage4 = TinyBlock(width * 8, activation=activation, norm=normalization) self.downsample5 = maxpool(width * 16) - def forward(self, x): + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: c1 = self.stage1(x) x = self.downsample2(c1) c2 = self.stage2(x) @@ -236,13 +237,13 @@ def __init__( ) -> None: super().__init__() - def downsample(in_channels, out_channels): + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def csp(num_channels, depth): + def csp(num_channels: int, depth: int) -> nn.Module: return CSPBlock(num_channels, num_channels, depth=depth) - def spp(num_channels): + def spp(num_channels: int) -> nn.Module: return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) self.stage1 = nn.Sequential( @@ -318,15 +319,15 @@ def __init__( ) -> None: super().__init__() - def downsample(in_channels, out_channels, kernel_size=3): + def downsample(in_channels: int, out_channels: int, kernel_size: int = 3) -> nn.Module: return Conv( in_channels, out_channels, kernel_size=kernel_size, stride=2, activation=activation, norm=normalization ) - def csp(num_channels, depth): + def csp(num_channels: int, depth: int) -> nn.Module: return CSPBlock(num_channels, num_channels, depth=depth) - def spp(num_channels): + def spp(num_channels: int) -> nn.Module: return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) self.stage1 = downsample(3, width, kernel_size=6) @@ -418,7 +419,7 @@ def __init__( activation: Optional[str] = "leaky", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() @@ -442,18 +443,19 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell - def conv(in_channels, out_channels, kernel_size=1): + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) - def upsample(in_channels, out_channels): + def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) upsample = nn.Upsample(scale_factor=2, mode="nearest") return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) - def outputs(in_channels): + def outputs(in_channels: int) -> nn.Module: return nn.Conv2d(in_channels, num_outputs, kernel_size=1, stride=1, bias=True) - def detect(prior_shape_idxs): + def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: + assert prior_shapes is not None return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) @@ -482,7 +484,7 @@ def detect(prior_shape_idxs): self.detect4 = detect([3, 4, 5]) self.detect5 = detect([6, 7, 8]) - def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: detections = [] # Outputs from detection layers losses 
= [] # Losses from detection layers hits = [] # Number of targets each detection layer was responsible for @@ -563,7 +565,7 @@ def __init__( activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() @@ -587,10 +589,10 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell - def conv(in_channels, out_channels): + def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) - def csp(in_channels, out_channels): + def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPBlock( in_channels, out_channels, @@ -600,20 +602,21 @@ def csp(in_channels, out_channels): activation=activation, ) - def out(num_channels): + def out(num_channels: int) -> nn.Module: conv = Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) outputs = nn.Conv2d(num_channels, num_outputs, kernel_size=1) return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) - def upsample(in_channels, out_channels): + def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) upsample = nn.Upsample(scale_factor=2, mode="nearest") return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) - def downsample(in_channels, out_channels): + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def detect(prior_shape_idxs): + def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: + assert prior_shapes is not None return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) @@ -642,7 +645,7 @@ def detect(prior_shape_idxs): self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) - def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: detections = [] # Outputs from detection layers losses = [] # Losses from detection layers hits = [] # Number of targets each detection layer was responsible for @@ -734,7 +737,7 @@ def __init__( activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() @@ -758,17 +761,17 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell - def downsample(in_channels, out_channels): + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def conv(in_channels, out_channels): + def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) - def out(in_channels): + def out(in_channels: int) -> nn.Module: outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) return 
nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs)])) - def csp(in_channels, out_channels): + def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPBlock( in_channels, out_channels, @@ -778,7 +781,8 @@ def csp(in_channels, out_channels): activation=activation, ) - def detect(prior_shape_idxs): + def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: + assert prior_shapes is not None return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) @@ -814,7 +818,7 @@ def detect(prior_shape_idxs): self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) - def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: detections = [] # Outputs from detection layers losses = [] # Losses from detection layers hits = [] # Number of targets each detection layer was responsible for @@ -903,7 +907,7 @@ def __init__( activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() @@ -916,16 +920,16 @@ def __init__( if modulo != 0: raise ValueError("The number of provided prior shapes needs to be divisible by 3.") - def downsample(in_channels, out_channels): + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - def conv(in_channels, out_channels, kernel_size=1): + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) - def linear(in_channels, out_channels): + def linear(in_channels: int, out_channels: int) -> nn.Module: return nn.Conv2d(in_channels, out_channels, kernel_size=1) - def csp(in_channels, out_channels): + def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPBlock( in_channels, out_channels, @@ -935,18 +939,19 @@ def csp(in_channels, out_channels): activation=activation, ) - def features(num_channels): + def features(num_channels: int) -> nn.Module: return nn.Sequential( conv(num_channels, num_channels, kernel_size=3), conv(num_channels, num_channels, kernel_size=3), ) - def classprob(num_channels): + def classprob(num_channels: int) -> nn.Module: num_outputs = anchors_per_cell * num_classes outputs = linear(num_channels, num_outputs) return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) - def detect(prior_shape_idxs): + def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: + assert prior_shapes is not None return create_detection_layer( prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) @@ -994,7 +999,7 @@ def detect(prior_shape_idxs): self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) - def forward(self, x: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: detections = [] # Outputs from detection layers losses = [] # Losses from detection layers hits = [] # Number of targets each detection 
layer was responsible for diff --git a/pl_bolts/models/detection/yolo/types.py b/pl_bolts/models/detection/yolo/types.py new file mode 100644 index 0000000000..8a37e72a89 --- /dev/null +++ b/pl_bolts/models/detection/yolo/types.py @@ -0,0 +1,7 @@ +from typing import Any, Dict, List, Tuple + +from torch import Tensor + +TARGET = Dict[str, Any] +TARGETS = List[TARGET] +NETWORK_OUTPUT = Tuple[List[Tensor], List[Tensor], List[int]] # detections, losses, hits diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index 2ceb1313bb..eb762aa4d0 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -16,12 +16,12 @@ # a "@torch.jit.script" function, it's difficult to make this decision at call time. if version.parse(torch.__version__) >= version.parse("1.10.0"): - def meshgrid(x, y): - return torch.meshgrid((x, y), indexing="ij") + def meshgrid(x: Tensor, y: Tensor) -> List[Tensor]: + return torch.meshgrid((x, y), indexing="ij") # type: ignore else: - meshgrid = torch.meshgrid + meshgrid = torch.meshgrid # type: ignore def grid_offsets(grid_size: Tensor) -> Tensor: @@ -33,8 +33,8 @@ def grid_offsets(grid_size: Tensor) -> Tensor: Returns: A ``[height, width, 2]`` tensor containing the grid cell `(x, y)` offsets. """ - x_range = torch.arange(grid_size[0], device=grid_size.device) - y_range = torch.arange(grid_size[1], device=grid_size.device) + x_range = torch.arange(grid_size[0].item(), device=grid_size.device) + y_range = torch.arange(grid_size[1].item(), device=grid_size.device) grid_y, grid_x = meshgrid(y_range, x_range) return torch.stack((grid_x, grid_y), -1) @@ -99,7 +99,7 @@ def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: return inter / union -def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> List[Tensor]: +def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> Tensor: """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target significantly (IoU greater than ``threshold``). @@ -119,7 +119,7 @@ def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> Lis return below_threshold.view(shape) -def is_inside_box(points, boxes): +def is_inside_box(points: Tensor, boxes: Tensor) -> Tensor: """Get pairwise truth values of whether the point is inside the box. 
Args: diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 78c4167fa3..37d258ecbb 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -1,15 +1,17 @@ from copy import copy -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch import torch.nn as nn from pytorch_lightning import LightningModule from pytorch_lightning.utilities.cli import LightningCLI +from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT from torch import Tensor, optim from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork +from pl_bolts.models.detection.yolo.types import TARGET, TARGETS from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHMETRICS_DETECTION_AVAILABLE, _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg @@ -87,7 +89,7 @@ class YOLO(LightningModule): def __init__( self, - network: nn.ModuleList, + network: nn.Module, optimizer: Type[optim.Optimizer] = optim.SGD, optimizer_params: Dict[str, Any] = {"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0005}, lr_scheduler: Type[optim.lr_scheduler._LRScheduler] = LinearWarmupCosineAnnealingLR, @@ -114,7 +116,9 @@ def __init__( self._val_map = MAP(compute_on_step=False) self._test_map = MAP(compute_on_step=False) - def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Tensor, Tensor]: + def forward( # type: ignore + self, images: Tensor, targets: Optional[TARGETS] = None + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets are provided, computes the losses from the detection layers. @@ -142,7 +146,7 @@ def forward(self, images: Tensor, targets: Optional[List[Dict[str, Tensor]]] = N total_hits = sum(hits) for layer_idx, layer_hits in enumerate(hits): - hit_rate = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 + hit_rate: Union[Tensor, float] = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=False) losses = torch.stack(losses).sum(0) @@ -176,7 +180,7 @@ def configure_optimizers(self) -> Tuple[List, List]: lr_scheduler = self.lr_scheduler_class(optimizer, **self.lr_scheduler_params) return [optimizer], [lr_scheduler] - def training_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int) -> Dict[str, Tensor]: + def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> STEP_OUTPUT: # type: ignore """Computes the training loss. Args: @@ -199,7 +203,9 @@ def training_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], bat return {"loss": losses.sum()} - def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): + def validation_step( # type: ignore + self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int + ) -> Optional[STEP_OUTPUT]: """Evaluates a batch of data from the validation set. 
Args: @@ -220,14 +226,14 @@ def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], b targets = self.process_targets(targets) self._val_map(detections, targets) - def validation_epoch_end(self, outputs): + def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: if _TORCHMETRICS_DETECTION_AVAILABLE: map_scores = self._val_map.compute() map_scores = {"val/" + k: v for k, v in map_scores.items()} self.log_dict(map_scores, sync_dist=True) self._val_map.reset() - def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): + def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore """Evaluates a batch of data from the test set. Args: @@ -248,7 +254,7 @@ def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_i targets = self.process_targets(targets) self._test_map(detections, targets) - def test_epoch_end(self, outputs): + def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: if _TORCHMETRICS_DETECTION_AVAILABLE: map_scores = self._test_map.compute() map_scores = {"test/" + k: v for k, v in map_scores.items()} @@ -288,7 +294,6 @@ def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: have an IoU greater than the NMS threshold with a higher scoring box. The returned detections are sorted by descending confidence. The items of the dictionaries are as follows: - - boxes (``Tensor[batch_size, N, 4]``): detected bounding box `(x1, y1, x2, y2)` coordinates - scores (``Tensor[batch_size, N]``): detection confidences - labels (``Int64Tensor[batch_size, N]``): the predicted class IDs @@ -322,7 +327,7 @@ def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: return result - def process_targets(self, targets: List[Dict[str, Tensor]]) -> List[Dict[str, Tensor]]: + def process_targets(self, targets: TARGETS) -> TARGETS: """Duplicates multi-label targets to create one target for each label. Args: @@ -344,9 +349,7 @@ def process_targets(self, targets: List[Dict[str, Tensor]]) -> List[Dict[str, Te return result - def _validate_batch( - self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]] - ) -> Tuple[Tensor, List[Dict[str, Tensor]]]: + def _validate_batch(self, batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: """Reads a batch of data, validates the format, and stacks the images into a single tensor. Args: @@ -429,7 +432,7 @@ def __init__( overlap_loss_multiplier: Optional[float] = None, class_loss_multiplier: Optional[float] = None, confidence_loss_multiplier: Optional[float] = None, - **kwargs, + **kwargs: Any, ) -> None: network = DarknetNetwork( network_config, @@ -456,7 +459,7 @@ class ResizedVOCDetectionDataModule(VOCDetectionDataModule): height: Resize images to this height. """ - def __init__(self, width: int = 608, height: int = 608, **kwargs): + def __init__(self, width: int = 608, height: int = 608, **kwargs: Any): super().__init__(**kwargs) self.image_size = (height, width) @@ -474,7 +477,7 @@ def default_transforms(self) -> Callable: ] return Compose(transforms) - def _resize(self, image: Tensor, target: Dict[str, Any]): + def _resize(self, image: Tensor, target: TARGET) -> Tuple[Tensor, TARGET]: """Rescales the image and target to ``self.image_size``. Args: From 2737fa4e95b21393a75c8edfefd16e9c624cfa35 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 14 Apr 2022 13:13:44 +0200 Subject: [PATCH 09/76] Updated CHANGELOG. 
--- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b16b2d5053..540e1e105c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Improved YOLO model includes YOLOv4, YOLOv5, and YOLOX networks and training algorithms ([#552](https://github.com/PyTorchLightning/pytorch-lightning-bolts/pull/817)) + ### Deprecated From 26987eb3a08b8048f5f4f31978e479fc70da3b64 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 14 Apr 2022 15:08:58 +0200 Subject: [PATCH 10/76] Torchvision import made conditional --- pl_bolts/models/detection/yolo/layers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 82b80eb9bc..17569b12cd 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -3,7 +3,6 @@ import torch from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor, nn -from torchvision.ops import box_convert from pl_bolts.models.detection.yolo.loss import LossFunction from pl_bolts.models.detection.yolo.target_matching import ( @@ -15,6 +14,12 @@ ) from pl_bolts.models.detection.yolo.utils import global_xy from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils.warnings import warn_missing_pkg + +if _TORCHVISION_AVAILABLE: + from torchvision.ops import box_convert +else: # pragma: no cover + warn_missing_pkg("torchvision") def _get_padding(kernel_size: int, stride: int) -> Tuple[int, nn.Module]: From 09bce808f15f0e6e71828ffe58653b25f54b864a Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 28 Apr 2022 12:10:15 +0300 Subject: [PATCH 11/76] Use expand() instead of broadcast_to() for backward compatibility --- .../models/detection/yolo/darknet_network.py | 2 +- pl_bolts/models/detection/yolo/loss.py | 20 ++++++++++++------- pl_bolts/models/detection/yolo/utils.py | 1 - pl_bolts/models/detection/yolo/yolo_module.py | 6 +++--- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index fa9897d145..4e25a6fccd 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -6,8 +6,8 @@ import numpy as np import torch import torch.nn as nn -from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_info from torch import Tensor from pl_bolts.models.detection.yolo import layers diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index c5316c02d4..9b4f3ceaf1 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -2,9 +2,17 @@ from typing import Callable, Dict, Optional, Tuple, Union import torch +from pytorch_lightning.utilities.rank_zero import rank_zero_warn from torch import Tensor from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits -from torchvision.ops import box_iou, generalized_box_iou + +from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils.warnings import warn_missing_pkg + +if _TORCHVISION_AVAILABLE: + from torchvision.ops import box_iou, 
generalized_box_iou +else: + warn_missing_pkg("torchvision") def _upcast(t: Tensor) -> Tensor: @@ -29,8 +37,8 @@ def complete_iou(boxes1: Tensor, boxes2: Tensor, distance_only: bool = False) -> """ # Degenerate boxes give inf / nan results, so do an early check. - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + if not ((boxes1[:, 2:] >= boxes1[:, :2]).all() and (boxes2[:, 2:] >= boxes2[:, :2]).all()): + rank_zero_warn("Some boxes have negative width or height, or the coordinates contain infinite or NaN values.") iou = box_iou(boxes1, boxes2) @@ -161,8 +169,7 @@ def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callab if self.predict_overlap is not None: # When predicting overlap, target confidence is different for each pair of a prediction and a target. The # tensors have to be broadcasted to [M, N]. - preds = preds.unsqueeze(0) - preds = torch.broadcast_to(preds, overlap.shape) + preds = preds.unsqueeze(0).expand(overlap.shape) targets = torch.ones_like(preds) - self.predict_overlap # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. targets = targets + (self.predict_overlap * overlap.detach().clamp(min=0)) @@ -174,8 +181,7 @@ def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callab if result.ndim == 1: # When not predicting overlap, target confidence is the same for every target, but we should still return a # matrix. - result = result.unsqueeze(0) - torch.broadcast_to(result, overlap.shape) + result = result.unsqueeze(0).expand(overlap.shape) return result diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index eb762aa4d0..470fba976b 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -19,7 +19,6 @@ def meshgrid(x: Tensor, y: Tensor) -> List[Tensor]: return torch.meshgrid((x, y), indexing="ij") # type: ignore - else: meshgrid = torch.meshgrid # type: ignore diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 37d258ecbb..88a6327d5e 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -17,7 +17,7 @@ from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHMETRICS_DETECTION_AVAILABLE: - from torchmetrics.detection.map import MAP + from torchmetrics.detection import MeanAveragePrecision if _TORCHVISION_AVAILABLE: from torchvision.ops import batched_nms @@ -113,8 +113,8 @@ def __init__( self.detections_per_image = detections_per_image if _TORCHMETRICS_DETECTION_AVAILABLE: - self._val_map = MAP(compute_on_step=False) - self._test_map = MAP(compute_on_step=False) + self._val_map = MeanAveragePrecision(compute_on_step=False) + self._test_map = MeanAveragePrecision(compute_on_step=False) def forward( # type: ignore self, images: Tensor, targets: Optional[TARGETS] = None From a80536eaeb7f6ab6df997786df82d72f76d193fd Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 28 Apr 2022 12:27:54 +0300 Subject: [PATCH 12/76] Use pytorch_lightning.utilities.distributed if pytorch_lightning.utilities.rank_zero is not available --- pl_bolts/models/detection/yolo/darknet_network.py | 7 ++++++- pl_bolts/models/detection/yolo/loss.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 4e25a6fccd..cc53807ab0 100644 
--- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -7,7 +7,12 @@ import torch import torch.nn as nn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_info + +try: + from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_info +except ModuleNotFoundError: + from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info + from torch import Tensor from pl_bolts.models.detection.yolo import layers diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 9b4f3ceaf1..7fef0ecff0 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -2,7 +2,12 @@ from typing import Callable, Dict, Optional, Tuple, Union import torch -from pytorch_lightning.utilities.rank_zero import rank_zero_warn + +try: + from pytorch_lightning.utilities.rank_zero import rank_zero_warn +except ModuleNotFoundError: + from pytorch_lightning.utilities.distributed import rank_zero_warn + from torch import Tensor from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits From b1b8db3a5e68318a7e1eee35384dcf6bd0a355da Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 8 Jun 2022 09:07:02 +0300 Subject: [PATCH 13/76] YOLOV4P6 network architecture * Added the deeper P6 variant of YOLOv4. * Width of each backbone stage can be configured freely. * The number of input channels in the PyTorch architectures can be changed. * Added predict_step(). --- pl_bolts/models/detection/__init__.py | 2 + .../models/detection/yolo/darknet_network.py | 19 +- .../models/detection/yolo/torch_networks.py | 476 ++++++++++++------ pl_bolts/models/detection/yolo/yolo_module.py | 66 ++- tests/models/test_detection.py | 24 +- 5 files changed, 414 insertions(+), 173 deletions(-) diff --git a/pl_bolts/models/detection/__init__.py b/pl_bolts/models/detection/__init__.py index 7d8615c089..aa90b09437 100644 --- a/pl_bolts/models/detection/__init__.py +++ b/pl_bolts/models/detection/__init__.py @@ -5,6 +5,7 @@ from pl_bolts.models.detection.yolo.torch_networks import ( YOLOV4Backbone, YOLOV4Network, + YOLOV4P6Network, YOLOV4TinyBackbone, YOLOV4TinyNetwork, YOLOV5Backbone, @@ -20,6 +21,7 @@ "DarknetNetwork", "YOLOV4Backbone", "YOLOV4Network", + "YOLOV4P6Network", "YOLOV4TinyBackbone", "YOLOV4TinyNetwork", "YOLOV5Backbone", diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index cc53807ab0..2788122742 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -28,7 +28,9 @@ class DarknetNetwork(nn.Module): """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation.""" - def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwargs: Any) -> None: + def __init__( + self, config_path: str, weights_path: Optional[str] = None, in_channels: Optional[int] = None, **kwargs: Any + ) -> None: """Parses a Darknet configuration file and creates the network structure. Iterates through the layers from the configuration and creates corresponding PyTorch modules. If @@ -38,6 +40,7 @@ def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwarg Args: config_path: Path to a Darknet configuration file that defines the network architecture. 
weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. + in_channels: Number of channels in the input image. match_sim_ota: If ``True``, matches a target to an anchor using the SimOTA algorithm from YOLOX. match_size_ratio: If specified, matches a target to an anchor if its width and height relative to the anchor is smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects @@ -67,10 +70,14 @@ def __init__(self, config_path: str, weights_path: Optional[str] = None, **kwarg global_config = sections[0] layer_configs = sections[1:] + if in_channels is None: + in_channels = global_config.get("channels", 3) + assert isinstance(in_channels, int) + self.layers = nn.ModuleList() # num_inputs will contain the number of channels in the input of every layer up to the current layer. It is # initialized with the number of channels in the input image. - num_inputs = [global_config.get("channels", 3)] + num_inputs = [in_channels] for layer_config in layer_configs: config = {**global_config, **layer_config} layer, num_outputs = _create_layer(config, num_inputs, **kwargs) @@ -132,12 +139,12 @@ def read(tensor: Tensor) -> int: Returns the number of elements read. If there's no more data in ``weight_file``, returns 0. """ - x = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) - num_elements = x.size + np_array = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) + num_elements = np_array.size if num_elements > 0: - x = torch.from_numpy(x).view_as(tensor) + source = torch.from_numpy(np_array).view_as(tensor) with torch.no_grad(): - tensor.copy_(x) + tensor.copy_(source) return num_elements for layer_idx, layer in enumerate(self.layers): diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 9e63d2c94d..3b0e8b3498 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -82,7 +82,9 @@ def forward(self, x: Tensor) -> Tensor: class CSPBlock(nn.Module): """One stage of a Cross Stage Partial Network (CSPNet). - Encapsulates a number of bottleneck blocks in the CSP structure. + Encapsulates a number of bottleneck blocks in the "fusion first" CSP structure. + + `Chien-Yao Wang et al. `_ Args: in_channels: Number of input channels that the CSP block expects. @@ -108,9 +110,10 @@ def __init__( # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two # convolutions with N/2 output channels. hidden_channels = out_channels // 2 + self.split1 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) self.split2 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - bottlenecks = [ + bottlenecks: List[nn.Module] = [ BottleneckBlock(hidden_channels, hidden_channels, shortcut=shortcut, norm=norm, activation=activation) for _ in range(depth) ] @@ -161,8 +164,9 @@ class YOLOV4TinyBackbone(nn.Module): """Backbone of the "tiny" network architecture from YOLOv4. Args: - width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a - number of channels that is a multiple of this value. + in_channels: Number of channels in the input image. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. 
activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -170,6 +174,7 @@ class YOLOV4TinyBackbone(nn.Module): def __init__( self, + in_channels: int = 3, width: int = 32, activation: Optional[str] = "leaky", normalization: Optional[str] = "batchnorm", @@ -194,7 +199,7 @@ def maxpool(out_channels: int) -> nn.Module: ) ) - self.stage1 = Conv(3, width, kernel_size=3, stride=2, activation=activation, norm=normalization) + self.stage1 = Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization) self.downsample2 = downsample(width, width * 2) self.stage2 = TinyBlock(width * 2, activation=activation, norm=normalization) self.downsample3 = maxpool(width * 4) @@ -203,7 +208,7 @@ def maxpool(out_channels: int) -> nn.Module: self.stage4 = TinyBlock(width * 8, activation=activation, norm=normalization) self.downsample5 = maxpool(width * 16) - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + def forward(self, x: Tensor) -> List[Tensor]: c1 = self.stage1(x) x = self.downsample2(c1) c2 = self.stage2(x) @@ -215,15 +220,16 @@ def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: c4 = self.stage4(x) x = torch.cat((x, c4), dim=1) c5 = self.downsample5(x) - return c1, c2, c3, c4, c5 + return [c1, c2, c3, c4, c5] class YOLOV4Backbone(nn.Module): """A backbone that approximately corresponds to the Cross Stage Partial Network from YOLOv4. Args: - width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a - number of channels that is a multiple of this value. + in_channels: Number of channels in the input image. + widths: Number of channels at each network stage. + depths: Number of bottleneck layers at each network stage. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
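
For illustration, a minimal sketch of how the reworked backbone can be used with a non-default number of input channels; the import path follows this patch series and the input size is arbitrary:

    import torch
    from pl_bolts.models.detection import YOLOV4Backbone

    # Four-channel input (for example RGB plus depth); widths and depths spell out the defaults.
    backbone = YOLOV4Backbone(in_channels=4, widths=(32, 64, 128, 256, 512, 1024), depths=(1, 1, 2, 8, 8, 4))
    features = backbone(torch.rand(1, 4, 256, 256))
    # The result is a list with one feature map per downsampling stage, from the highest to the lowest resolution.
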
@@ -231,80 +237,62 @@ class YOLOV4Backbone(nn.Module): def __init__( self, - width: int = 32, + in_channels: int = 3, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024), + depths: Sequence[int] = (1, 1, 2, 8, 8, 4), activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", ) -> None: super().__init__() - def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + if len(widths) != len(depths): + raise ValueError("Width and depth has to be given for an equal number of stages.") - def csp(num_channels: int, depth: int) -> nn.Module: - return CSPBlock(num_channels, num_channels, depth=depth) + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) - def spp(num_channels: int) -> nn.Module: - return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - self.stage1 = nn.Sequential( - OrderedDict( - [ - ("stem", Conv(3, width, kernel_size=3, stride=1, activation=activation, norm=normalization)), - ("downsample", downsample(width, width * 2)), - ("csp", csp(width * 2, 1)), - ] - ) - ) - self.stage2 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 2, width * 4)), - ("csp", csp(width * 4, 2)), - ] - ) - ) - self.stage3 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 4, width * 8)), - ("csp", csp(width * 8, 8)), - ] - ) - ) - self.stage4 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 8, width * 16)), - ("csp", csp(width * 16, 8)), - ] + def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: + csp = CSPBlock( + out_channels, + out_channels, + depth=depth, + shortcut=True, + activation=activation, + norm=normalization, ) - ) - self.stage5 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 16, width * 32)), - ("csp", csp(width * 32, 4)), - ("spp", spp(width * 32)), - ] + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("csp", csp), + ] + ) ) - ) - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: - c1 = self.stage1(x) - c2 = self.stage2(c1) - c3 = self.stage3(c2) - c4 = self.stage4(c3) - c5 = self.stage5(c4) - return c1, c2, c3, c4, c5 + convs = [conv3x3(in_channels, widths[0])] + [conv3x3(widths[0], widths[0]) for _ in range(depths[0] - 1)] + self.stem = nn.Sequential(*convs) + self.stages = nn.ModuleList(stage(widths[n], widths[n + 1], depth) for n, depth in enumerate(depths[:-1])) + + def forward(self, x: Tensor) -> List[Tensor]: + x = self.stem(x) + outputs: List[Tensor] = [] + for stage in self.stages: + x = stage(x) + outputs.append(x) + return outputs class YOLOV5Backbone(nn.Module): """The Cross Stage Partial Network backbone from YOLOv5. Args: + in_channels: Number of channels in the input image. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. - width: The number of channels in the narrowest convolutional layer. 
The wider convolutional layers will use a - number of channels that is a multiple of this value. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -312,8 +300,9 @@ class YOLOV5Backbone(nn.Module): def __init__( self, - depth: int = 3, + in_channels: int = 3, width: int = 64, + depth: int = 3, activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", ) -> None: @@ -324,54 +313,37 @@ def downsample(in_channels: int, out_channels: int, kernel_size: int = 3) -> nn. in_channels, out_channels, kernel_size=kernel_size, stride=2, activation=activation, norm=normalization ) - def csp(num_channels: int, depth: int) -> nn.Module: - return CSPBlock(num_channels, num_channels, depth=depth) - - def spp(num_channels: int) -> nn.Module: - return FastSPP(num_channels, num_channels, kernel_size=5, activation=activation, norm=normalization) - - self.stage1 = downsample(3, width, kernel_size=6) - self.stage2 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width, width * 2)), - ("csp", csp(width * 2, depth)), - ] - ) - ) - self.stage3 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 2, width * 4)), - ("csp", csp(width * 4, depth * 2)), - ] - ) - ) - self.stage4 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 4, width * 8)), - ("csp", csp(width * 8, depth * 3)), - ] + def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: + csp = CSPBlock( + out_channels, + out_channels, + depth=depth, + shortcut=True, + activation=activation, + norm=normalization, ) - ) - self.stage5 = nn.Sequential( - OrderedDict( - [ - ("downsample", downsample(width * 8, width * 16)), - ("csp", csp(width * 16, depth)), - ("spp", spp(width * 16)), - ] + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("csp", csp), + ] + ) ) - ) - def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + self.stage1 = downsample(in_channels, width, kernel_size=6) + self.stage2 = stage(width, width * 2, depth) + self.stage3 = stage(width * 2, width * 4, depth * 2) + self.stage4 = stage(width * 4, width * 8, depth * 3) + self.stage5 = stage(width * 8, width * 16, depth) + + def forward(self, x: Tensor) -> List[Tensor]: c1 = self.stage1(x) c2 = self.stage2(c1) c3 = self.stage3(c2) c4 = self.stage4(c3) c5 = self.stage5(c4) - return c1, c2, c3, c4, c5 + return [c1, c2, c3, c4, c5] class YOLOV4TinyNetwork(nn.Module): @@ -526,8 +498,7 @@ class YOLOV4Network(nn.Module): Args: num_classes: Number of different classes that this model predicts. backbone: A backbone network that returns the output from each stage. - width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a - number of channels that is a multiple of this value. + widths: Number of channels at each network stage. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
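
Because the detection network only consumes the last feature maps returned by the backbone, a backbone with matching per-stage channel counts can be swapped in through the ``backbone`` argument. A rough sketch under that assumption (the widths are deliberately small and purely illustrative):

    import torch
    from pl_bolts.models.detection import YOLOV4Backbone, YOLOV4Network

    widths = (8, 16, 32, 64, 128, 256)
    backbone = YOLOV4Backbone(widths=widths, depths=(1, 1, 1, 1, 1, 1))
    network = YOLOV4Network(num_classes=2, backbone=backbone, widths=widths)
    detections, losses, hits = network(torch.rand(1, 3, 256, 256))  # losses and hits stay empty without targets
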
@@ -561,7 +532,7 @@ def __init__( self, num_classes: int, backbone: Optional[nn.Module] = None, - width: int = 32, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024), activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, @@ -589,6 +560,9 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) @@ -602,9 +576,9 @@ def csp(in_channels: int, out_channels: int) -> nn.Module: activation=activation, ) - def out(num_channels: int) -> nn.Module: - conv = Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) - outputs = nn.Conv2d(num_channels, num_outputs, kernel_size=1) + def out(in_channels: int) -> nn.Module: + conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) def upsample(in_channels: int, out_channels: int) -> nn.Module: @@ -621,25 +595,34 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) - self.backbone = backbone or YOLOV4Backbone(width=width, normalization=normalization, activation=activation) + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV4Backbone(widths=widths, normalization=normalization, activation=activation) + + w3 = widths[-3] + w4 = widths[-2] + w5 = widths[-1] - self.pre3 = conv(width * 8, width * 4) - self.fpn3 = csp(width * 8, width * 8) - self.out3 = out(width * 8) + self.spp = spp(w5, w5) - self.pre4 = conv(width * 16, width * 8) - self.fpn4 = csp(width * 16, width * 16) - self.pan4 = csp(width * 24, width * 16) - self.out4 = out(width * 16) + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5, w4 // 2) + self.fpn4 = csp(w4, w4) - self.pan5 = csp(width * 48, width * 32) - self.out5 = out(width * 32) + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4, w3 // 2) + self.fpn3 = csp(w3, w3) - self.upsample4 = upsample(width * 16, width * 4) - self.upsample5 = upsample(width * 32, width * 8) + self.downsample3 = downsample(w3, w3) + self.pan4 = csp(w3 + w4, w4) - self.downsample3 = downsample(width * 8, width * 8) - self.downsample4 = downsample(width * 16, width * 16) + self.downsample4 = downsample(w4, w4) + self.pan5 = csp(w4 + w5, w5) + + self.out3 = out(w3) + self.out4 = out(w4) + self.out5 = out(w5) self.detect3 = detect(range(0, anchors_per_cell)) self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) @@ -652,19 +635,15 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU image_size = get_image_size(x) - c3, c4, c5 = self.backbone(x)[-3:] + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) - x = self.pre4(c4) - x = torch.cat((x, self.upsample5(c5)), dim=1) + x = torch.cat((self.upsample5(c5), self.pre4(c4)), dim=1) p4 = self.fpn4(x) - - x = self.pre3(c3) - x = torch.cat((x, self.upsample4(p4)), dim=1) + x = 
torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) n3 = self.fpn3(x) - x = torch.cat((self.downsample3(n3), p4), dim=1) n4 = self.pan4(x) - x = torch.cat((self.downsample4(n4), c5), dim=1) n5 = self.pan5(x) @@ -689,6 +668,211 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU return detections, losses, hits +class YOLOV4P6Network(nn.Module): + """Network architecture that corresponds approximately to the variant of YOLOv4 with four detection layers. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + widths: Number of channels at each network stage. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a tensor with as many elements as there are input boxes. Valid values for a string are + "iou", "giou", "diou", and "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024, 1024), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: List[Tuple[int, int]] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. 
+ if prior_shapes is None: + prior_shapes = [ + (13, 17), + (31, 25), + (24, 51), + (61, 45), + (61, 45), + (48, 102), + (119, 96), + (97, 189), + (97, 189), + (217, 184), + (171, 384), + (324, 451), + (324, 451), + (545, 357), + (616, 618), + (1024, 1024), + ] + anchors_per_cell = 4 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 4) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 4.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def csp(in_channels: int, out_channels: int) -> nn.Module: + return CSPBlock( + in_channels, + out_channels, + depth=2, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def out(in_channels: int) -> nn.Module: + conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: + assert prior_shapes is not None + return create_detection_layer( + prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs + ) + + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV4Backbone( + widths=widths, depths=(1, 1, 3, 15, 15, 7, 7), normalization=normalization, activation=activation + ) + + w3 = widths[-4] + w4 = widths[-3] + w5 = widths[-2] + w6 = widths[-1] + + self.spp = spp(w6, w6) + + self.pre5 = conv(w5, w5 // 2) + self.upsample6 = upsample(w6, w5 // 2) + self.fpn5 = csp(w5, w5) + + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5, w4 // 2) + self.fpn4 = csp(w4, w4) + + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4, w3 // 2) + self.fpn3 = csp(w3, w3) + + self.downsample3 = downsample(w3, w3) + self.pan4 = csp(w3 + w4, w4) + + self.downsample4 = downsample(w4, w4) + self.pan5 = csp(w4 + w5, w5) + + self.downsample5 = downsample(w5, w5) + self.pan6 = csp(w5 + w6, w6) + + self.out3 = out(w3) + self.out4 = out(w4) + self.out5 = out(w5) + self.out6 = out(w6) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + self.detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5, x = self.backbone(x)[-4:] + c6 = self.spp(x) 
+ + x = torch.cat((self.upsample6(c6), self.pre5(c5)), dim=1) + p5 = self.fpn5(x) + x = torch.cat((self.upsample5(p5), self.pre4(c4)), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) + n3 = self.fpn3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + x = torch.cat((self.downsample5(n5), c6), dim=1) + n6 = self.pan6(x) + + y = self.detect3(self.out3(n3), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect3.losses) + hits.append(self.detect3.hits) + + y = self.detect4(self.out4(n4), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect4.losses) + hits.append(self.detect4.hits) + + y = self.detect5(self.out5(n5), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect5.losses) + hits.append(self.detect5.hits) + + y = self.detect6(self.out6(n6), image_size, targets) + detections.append(y) + if targets is not None: + losses.append(self.detect6.losses) + hits.append(self.detect6.hits) + + return detections, losses, hits + + class YOLOV5Network(nn.Module): """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting the ``depth`` and ``width`` parameters. @@ -696,9 +880,9 @@ class YOLOV5Network(nn.Module): Args: num_classes: Number of different classes that this model predicts. backbone: A backbone network that returns the output from each stage. - depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number of channels that is a multiple of this value. + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
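
A usage sketch for the new P6 variant, mirroring the tests added later in this series; the small widths are purely illustrative and a larger input size would normally be used:

    import torch
    from pl_bolts.models.detection import YOLO, YOLOV4P6Network

    network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128))
    model = YOLO(network)
    detections = model(torch.rand(1, 3, 256, 256))  # inference mode, since no targets are given
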
@@ -732,8 +916,8 @@ def __init__( self, num_classes: int, backbone: Optional[nn.Module] = None, - depth: int = 3, width: int = 64, + depth: int = 3, activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, @@ -761,6 +945,9 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") num_outputs = (5 + num_classes) * anchors_per_cell + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) @@ -791,6 +978,8 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: depth=depth, width=width, normalization=normalization, activation=activation ) + self.spp = spp(width * 16, width * 16) + self.pan3 = csp(width * 8, width * 4) self.out3 = out(width * 4) @@ -825,7 +1014,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU image_size = get_image_size(x) - c3, c4, c5 = self.backbone(x)[-3:] + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) p5 = self.fpn5(c5) x = torch.cat((self.upsample(p5), c4), dim=1) @@ -866,9 +1056,9 @@ class YOLOXNetwork(nn.Module): Args: num_classes: Number of different classes that this model predicts. backbone: A backbone network that returns the output from each stage. - depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number of channels that is a multiple of this value. + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
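
The prior shape bookkeeping works the same way in the three-scale networks: the flat list of (width, height) tuples is split evenly across the detection layers from the highest-resolution to the lowest-resolution one, so its length must be divisible by three (four in the P6 variant). A small worked illustration with arbitrary values:

    prior_shapes = [(12, 16), (19, 36), (40, 28), (36, 75), (76, 55), (72, 146), (142, 110), (192, 243), (459, 401)]
    anchors_per_cell, modulo = divmod(len(prior_shapes), 3)  # 3 anchors per cell, modulo == 0
    detect3_shapes = prior_shapes[:anchors_per_cell]                        # highest resolution layer
    detect4_shapes = prior_shapes[anchors_per_cell : anchors_per_cell * 2]
    detect5_shapes = prior_shapes[anchors_per_cell * 2 :]                   # lowest resolution layer
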
@@ -902,8 +1092,8 @@ def __init__( self, num_classes: int, backbone: Optional[nn.Module] = None, - depth: int = 3, width: int = 64, + depth: int = 3, activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: List[Tuple[int, int]] = None, @@ -920,6 +1110,9 @@ def __init__( if modulo != 0: raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) @@ -960,6 +1153,8 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: depth=depth, width=width, normalization=normalization, activation=activation ) + self.spp = spp(width * 16, width * 16) + self.pan3 = csp(width * 8, width * 4) self.out3_stem = conv(width * 4, width * 4) self.out3_feat = features(width * 4) @@ -1006,7 +1201,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU image_size = get_image_size(x) - c3, c4, c5 = self.backbone(x)[-3:] + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) p5 = self.fpn5(c5) x = torch.cat((self.upsample(p5), c4), dim=1) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 88a6327d5e..f56be2e8de 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -17,7 +17,12 @@ from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHMETRICS_DETECTION_AVAILABLE: - from torchmetrics.detection import MeanAveragePrecision + try: + from torchmetrics.detection import MeanAveragePrecision # type: ignore + except ImportError: + from torchmetrics.detection import MAP # type: ignore + + MeanAveragePrecision = MAP # type: ignore if _TORCHVISION_AVAILABLE: from torchvision.ops import batched_nms @@ -113,8 +118,8 @@ def __init__( self.detections_per_image = detections_per_image if _TORCHMETRICS_DETECTION_AVAILABLE: - self._val_map = MeanAveragePrecision(compute_on_step=False) - self._test_map = MeanAveragePrecision(compute_on_step=False) + self._val_map = MeanAveragePrecision() + self._test_map = MeanAveragePrecision() def forward( # type: ignore self, images: Tensor, targets: Optional[TARGETS] = None @@ -152,7 +157,7 @@ def forward( # type: ignore losses = torch.stack(losses).sum(0) return detections, losses - def configure_optimizers(self) -> Tuple[List, List]: + def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[optim.lr_scheduler._LRScheduler]]: """Constructs the optimizer and learning rate scheduler based on ``self.optimizer_params`` and ``self.lr_scheduler_params``. @@ -186,7 +191,7 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Args: batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target dictionaries. - batch_idx: The index of this batch. + batch_idx: Index of the current batch. Returns: A dictionary that includes the training loss in 'loss'. @@ -211,7 +216,7 @@ def validation_step( # type: ignore Args: batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target dictionaries. - batch_idx: The index of this batch + batch_idx: Index of the current batch. 
""" images, targets = self._validate_batch(batch) detections, losses = self(images, targets) @@ -224,7 +229,7 @@ def validation_step( # type: ignore if _TORCHMETRICS_DETECTION_AVAILABLE: detections = self.process_detections(detections) targets = self.process_targets(targets) - self._val_map(detections, targets) + self._val_map.update(detections, targets) def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: if _TORCHMETRICS_DETECTION_AVAILABLE: @@ -239,7 +244,7 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti Args: batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target dictionaries. - batch_idx: The index of this batch. + batch_idx: Index of the current batch. """ images, targets = self._validate_batch(batch) detections, losses = self(images, targets) @@ -252,7 +257,7 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti if _TORCHMETRICS_DETECTION_AVAILABLE: detections = self.process_detections(detections) targets = self.process_targets(targets) - self._test_map(detections, targets) + self._test_map.update(detections, targets) def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: if _TORCHMETRICS_DETECTION_AVAILABLE: @@ -261,6 +266,27 @@ def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> No self.log_dict(map_scores, sync_dist=True) self._test_map.reset() + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> List[Dict[str, Tensor]]: + """Feeds a batch of images to the network and returns the detected bounding boxes, confidence scores, and + class labels. + + If a prediction has a high score for more than one class, it will be duplicated. + + Args: + batch: A tuple of images and targets. Images is a list of 3-dimensional tensors. Targets is a list of target + dictionaries. + batch_idx: Index of the current batch. + dataloader_idx: Index of the current dataloader. + + Returns: + A list of dictionaries containing tensors "boxes", "scores", and "labels". "boxes" is a matrix of detected + bounding box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box + detections. "labels" is a vector of predicted class labels. + """ + images, _ = self._validate_batch(batch) + detections = self(images) + detections = self.process_detections(detections) + def infer(self, image: Tensor) -> Dict[str, Tensor]: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class labels. @@ -304,12 +330,8 @@ def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: Returns: Filtered detections. A list of prediction dictionaries, one for each image. """ - result = [] - for image_preds in preds: - boxes = image_preds[..., :4] - confidences = image_preds[..., 4] - classprobs = image_preds[..., 5:] + def process(boxes: Tensor, confidences: Tensor, classprobs: Tensor) -> Dict[str, Any]: scores = classprobs * confidences[:, None] # Select predictions with high scores. 
If a prediction has a high score for more than one class, it will be @@ -320,12 +342,9 @@ def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: keep = batched_nms(boxes, scores, labels, self.nms_threshold) keep = keep[: self.detections_per_image] - boxes = boxes[keep] - scores = scores[keep] - labels = labels[keep] - result.append({"boxes": boxes, "scores": scores, "labels": labels}) + return {"boxes": boxes[keep], "scores": scores[keep], "labels": labels[keep]} - return result + return [process(p[..., :4], p[..., 4], p[..., 5:]) for p in preds] def process_targets(self, targets: TARGETS) -> TARGETS: """Duplicates multi-label targets to create one target for each label. @@ -337,17 +356,14 @@ def process_targets(self, targets: TARGETS) -> TARGETS: Returns: Single-label targets. A list of target dictionaries, one for each image. """ - result = [] - for image_targets in targets: - boxes = image_targets["boxes"] - labels = image_targets["labels"] + def process(boxes: Tensor, labels: Tensor, **other: Any) -> Dict[str, Any]: if labels.ndim == 2: idxs, labels = labels.nonzero().T boxes = boxes[idxs] - result.append({"boxes": boxes, "labels": labels}) + return {"boxes": boxes, "labels": labels, **other} - return result + return [process(**t) for t in targets] def _validate_batch(self, batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: """Reads a batch of data, validates the format, and stacks the images into a single tensor. diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index cd5217ebbd..5d3d2ef7ab 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -12,6 +12,7 @@ FasterRCNN, RetinaNet, YOLOV4Network, + YOLOV4P6Network, YOLOV4TinyNetwork, YOLOV5Network, YOLOXNetwork, @@ -254,7 +255,7 @@ def test_yolov4_tiny_train(tmpdir): def test_yolov4(tmpdir): - network = YOLOV4Network(num_classes=2, width=4) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -262,7 +263,26 @@ def test_yolov4(tmpdir): def test_yolov4_train(tmpdir): - network = YOLOV4Network(num_classes=2, width=4) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + +def test_yolov4p6(tmpdir): + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + model = YOLO(network) + + image = torch.rand(1, 3, 256, 256) + model(image) + + +def test_yolov4p6_train(tmpdir): + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) From 5661dba9e8c71d31c61b55062a6c0d500064e703 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 8 Jun 2022 09:45:58 +0300 Subject: [PATCH 14/76] Fixed document generation, when MeanAveragePrecision is not available --- pl_bolts/models/detection/yolo/yolo_module.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index f56be2e8de..d59f0582fd 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ 
b/pl_bolts/models/detection/yolo/yolo_module.py @@ -19,10 +19,12 @@ if _TORCHMETRICS_DETECTION_AVAILABLE: try: from torchmetrics.detection import MeanAveragePrecision # type: ignore - except ImportError: - from torchmetrics.detection import MAP # type: ignore - MeanAveragePrecision = MAP # type: ignore + _MEAN_AVERAGE_PRECISION_AVAILABLE = True + except ImportError: + _MEAN_AVERAGE_PRECISION_AVAILABLE = False +else: + _MEAN_AVERAGE_PRECISION_AVAILABLE = False if _TORCHVISION_AVAILABLE: from torchvision.ops import batched_nms @@ -117,7 +119,7 @@ def __init__( self.nms_threshold = nms_threshold self.detections_per_image = detections_per_image - if _TORCHMETRICS_DETECTION_AVAILABLE: + if _MEAN_AVERAGE_PRECISION_AVAILABLE: self._val_map = MeanAveragePrecision() self._test_map = MeanAveragePrecision() @@ -226,13 +228,13 @@ def validation_step( # type: ignore self.log("val/class_loss", losses[2], sync_dist=True) self.log("val/total_loss", losses.sum(), sync_dist=True) - if _TORCHMETRICS_DETECTION_AVAILABLE: + if _MEAN_AVERAGE_PRECISION_AVAILABLE: detections = self.process_detections(detections) targets = self.process_targets(targets) self._val_map.update(detections, targets) def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: - if _TORCHMETRICS_DETECTION_AVAILABLE: + if _MEAN_AVERAGE_PRECISION_AVAILABLE: map_scores = self._val_map.compute() map_scores = {"val/" + k: v for k, v in map_scores.items()} self.log_dict(map_scores, sync_dist=True) @@ -254,13 +256,13 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti self.log("test/class_loss", losses[2], sync_dist=True) self.log("test/total_loss", losses.sum(), sync_dist=True) - if _TORCHMETRICS_DETECTION_AVAILABLE: + if _MEAN_AVERAGE_PRECISION_AVAILABLE: detections = self.process_detections(detections) targets = self.process_targets(targets) self._test_map.update(detections, targets) def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: - if _TORCHMETRICS_DETECTION_AVAILABLE: + if _MEAN_AVERAGE_PRECISION_AVAILABLE: map_scores = self._test_map.compute() map_scores = {"test/" + k: v for k, v in map_scores.items()} self.log_dict(map_scores, sync_dist=True) From 0ad5867f9dfce81a2baa6b11f7af2c63d4cbde29 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 8 Jun 2022 09:52:27 +0300 Subject: [PATCH 15/76] Use arxiv URL to avoid a too long line --- pl_bolts/models/detection/yolo/torch_networks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 3b0e8b3498..e7078da682 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -84,7 +84,7 @@ class CSPBlock(nn.Module): Encapsulates a number of bottleneck blocks in the "fusion first" CSP structure. - `Chien-Yao Wang et al. `_ + `Chien-Yao Wang et al. `_ Args: in_channels: Number of input channels that the CSP block expects. 
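
For readers unfamiliar with the structure mentioned in the CSPBlock docstring above, a self-contained sketch of the "fusion first" idea; this simplified module is not the implementation from this series, and the layer names are illustrative:

    import torch
    import torch.nn as nn

    class TinyCSPStage(nn.Module):
        """Split the input into two half-width branches, run bottlenecks on one of them, then fuse."""

        def __init__(self, channels: int, depth: int = 1) -> None:
            super().__init__()
            hidden = channels // 2
            self.split1 = nn.Conv2d(channels, hidden, kernel_size=1)
            self.split2 = nn.Conv2d(channels, hidden, kernel_size=1)
            self.bottlenecks = nn.Sequential(*(nn.Conv2d(hidden, hidden, kernel_size=3, padding=1) for _ in range(depth)))
            self.mix = nn.Conv2d(2 * hidden, channels, kernel_size=1)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            y1 = self.split1(x)                    # passthrough branch
            y2 = self.bottlenecks(self.split2(x))  # processed branch
            return self.mix(torch.cat((y1, y2), dim=1))
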
From 84a949f1abcd686a97387f7e74841c6ee1f38ce9 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 20 Jun 2022 18:33:20 +0300 Subject: [PATCH 16/76] Use torch.div() instead of // --- pl_bolts/models/detection/yolo/layers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 17569b12cd..a8826ebb26 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -113,7 +113,7 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str """ batch_size, num_features, height, width = x.shape num_attrs = self.num_classes + 5 - anchors_per_cell = num_features // num_attrs + anchors_per_cell = torch.div(num_features, num_attrs, rounding_mode="floor") if anchors_per_cell != len(self.prior_shapes): raise MisconfigurationException( "The model predicts {} bounding boxes per spatial location, but {} prior box dimensions are defined " @@ -140,10 +140,11 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str xy = xy * self.xy_scale - 0.5 * (self.xy_scale - 1) image_xy = global_xy(xy, image_size) + prior_shapes = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) if self.input_is_normalized: - image_wh = 4 * torch.square(wh) * torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + image_wh = 4 * torch.square(wh) * prior_shapes else: - image_wh = torch.exp(wh) * torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + image_wh = torch.exp(wh) * prior_shapes box = torch.cat((image_xy, image_wh), -1) box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy") output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) From cf1646cb4a97bc5555e5100308aef435382ec657 Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Tue, 2 Aug 2022 23:58:32 +0900 Subject: [PATCH 17/76] remove under_review decorators --- pl_bolts/models/detection/yolo/yolo_config.py | 13 +++---------- pl_bolts/models/detection/yolo/yolo_layers.py | 10 ---------- pl_bolts/models/detection/yolo/yolo_module.py | 4 ---- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_config.py b/pl_bolts/models/detection/yolo/yolo_config.py index fea807b1c0..254ee1523b 100644 --- a/pl_bolts/models/detection/yolo/yolo_config.py +++ b/pl_bolts/models/detection/yolo/yolo_config.py @@ -6,10 +6,8 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pl_bolts.models.detection.yolo import yolo_layers -from pl_bolts.utils.stability import under_review -@under_review() class YOLOConfiguration: """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. @@ -149,7 +147,6 @@ def convert(key, value): return sections -@under_review() def _create_layer(config: dict, num_inputs: List[int]) -> Tuple[nn.Module, int]: """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the layer config. 
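
On the torch.div() change above: torch.div() with rounding_mode="floor" is the explicit spelling of floor division on tensors and avoids the floor-division deprecation warning that the // operator can emit on some PyTorch versions. A minimal, illustrative comparison:

    import torch

    num_features = torch.tensor(255)  # e.g. (5 + num_classes) * anchors_per_cell with 80 classes
    num_attrs = 85
    # anchors_per_cell = num_features // num_attrs  # may emit a floor-division deprecation warning
    anchors_per_cell = torch.div(num_features, num_attrs, rounding_mode="floor")  # tensor(3), no warning
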
@@ -173,7 +170,6 @@ def _create_layer(config: dict, num_inputs: List[int]) -> Tuple[nn.Module, int]: return create_func[config["type"]](config, num_inputs) -@under_review() def _create_convolutional(config, num_inputs): module = nn.Sequential() @@ -210,14 +206,12 @@ def _create_convolutional(config, num_inputs): return module, config["filters"] -@under_review() def _create_maxpool(config, num_inputs): padding = (config["size"] - 1) // 2 module = nn.MaxPool2d(config["size"], config["stride"], padding) return module, num_inputs[-1] -@under_review() def _create_route(config, num_inputs): num_chunks = config.get("groups", 1) chunk_idx = config.get("group_id", 0) @@ -234,19 +228,16 @@ def _create_route(config, num_inputs): return module, num_outputs -@under_review() def _create_shortcut(config, num_inputs): module = yolo_layers.ShortcutLayer(config["from"]) return module, num_inputs[-1] -@under_review() def _create_upsample(config, num_inputs): module = nn.Upsample(scale_factor=config["stride"], mode="nearest") return module, num_inputs[-1] -@under_review() def _create_yolo(config, num_inputs): # The "anchors" list alternates width and height. anchor_dims = config["anchors"] @@ -264,8 +255,10 @@ def _create_yolo(config, num_inputs): overlap_loss_func = yolo_layers.SELoss() elif overlap_loss_name == "giou": overlap_loss_func = yolo_layers.GIoULoss() - else: + elif overlap_loss_name == "iou": overlap_loss_func = yolo_layers.IoULoss() + else: + assert False, "Unknown overlap loss: " + overlap_loss_name module = yolo_layers.DetectionLayer( num_classes=config["classes"], diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index 9e2d9f7475..9b1ee891df 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -5,7 +5,6 @@ from torch import Tensor, nn from pl_bolts.utils import _TORCHVISION_AVAILABLE -from pl_bolts.utils.stability import under_review from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHVISION_AVAILABLE: @@ -21,7 +20,6 @@ warn_missing_pkg("torchvision") -@under_review() def _corner_coordinates(xy: Tensor, wh: Tensor) -> Tensor: """Converts box center points and sizes to corner coordinates. @@ -38,7 +36,6 @@ def _corner_coordinates(xy: Tensor, wh: Tensor) -> Tensor: return torch.cat((top_left, bottom_right), -1) -@under_review() def _aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at the same coordinates. @@ -61,7 +58,6 @@ def _aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: return inter / union -@under_review() class SELoss(nn.MSELoss): def __init__(self): super().__init__(reduction="none") @@ -70,13 +66,11 @@ def forward(self, inputs: Tensor, target: Tensor) -> Tensor: return super().forward(inputs, target).sum(1) -@under_review() class IoULoss(nn.Module): def forward(self, inputs: Tensor, target: Tensor) -> Tensor: return 1.0 - box_iou(inputs, target).diagonal() -@under_review() class GIoULoss(nn.Module): def __init__(self) -> None: super().__init__() @@ -89,7 +83,6 @@ def forward(self, inputs: Tensor, target: Tensor) -> Tensor: return 1.0 - generalized_box_iou(inputs, target).diagonal() -@under_review() class DetectionLayer(nn.Module): """A YOLO detection layer. 
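
The geometry helpers above are easy to sanity-check by hand; the following values are consistent with the unit tests added later in this series (import path as used in that module):

    import torch
    from pl_bolts.models.detection.yolo.yolo_layers import _aligned_iou, _corner_coordinates

    # A box centred at (5, 5) with width and height 2 spans the corners (4, 4) and (6, 6).
    corners = _corner_coordinates(torch.tensor([5.0, 5.0]), torch.tensor([2.0, 2.0]))  # [4., 4., 6., 6.]

    # A 1x1 and a 2x2 box at the same location intersect in an area of 1 and cover an area of 4 in total.
    iou = _aligned_iou(torch.tensor([[1.0, 1.0]]), torch.tensor([[2.0, 2.0]]))  # [[0.25]]
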
@@ -468,7 +461,6 @@ def _calculate_losses( return losses, hits -@under_review() class Mish(nn.Module): """Mish activation.""" @@ -476,7 +468,6 @@ def forward(self, x): return x * torch.tanh(nn.functional.softplus(x)) -@under_review() class RouteLayer(nn.Module): """Route layer concatenates the output (or part of it) from given layers.""" @@ -497,7 +488,6 @@ def forward(self, x, outputs): return torch.cat(chunks, dim=1) -@under_review() class ShortcutLayer(nn.Module): """Shortcut layer adds a residual connection from the source layer.""" diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 6a012f1db9..ebb494f5ef 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -11,7 +11,6 @@ from pl_bolts.models.detection.yolo.yolo_layers import DetectionLayer, RouteLayer, ShortcutLayer from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHVISION_AVAILABLE -from pl_bolts.utils.stability import under_review from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHVISION_AVAILABLE: @@ -23,7 +22,6 @@ log = logging.getLogger(__name__) -@under_review() class YOLO(LightningModule): """PyTorch Lightning implementation of YOLOv3 and YOLOv4. @@ -455,7 +453,6 @@ def _filter_detections(self, detections: Dict[str, Tensor]) -> Dict[str, List[Te return {"boxes": out_boxes, "scores": out_scores, "classprobs": out_classprobs, "labels": out_labels} -@under_review() class Resize: """Rescales the image and target to given dimensions. @@ -486,7 +483,6 @@ def __call__(self, image: Tensor, target: Dict[str, Any]): return image, target -@under_review() def run_cli(): from argparse import ArgumentParser From fecf88cb512fcc812f8039a5ddc026e32a1c10e0 Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Wed, 3 Aug 2022 00:00:02 +0900 Subject: [PATCH 18/76] add yolo cfg with giou & update related test function --- tests/data/yolo_giou.cfg | 81 ++++++++++++++++++++++++++++++++++ tests/models/test_detection.py | 11 ++++- 2 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 tests/data/yolo_giou.cfg diff --git a/tests/data/yolo_giou.cfg b/tests/data/yolo_giou.cfg new file mode 100644 index 0000000000..16a96f918d --- /dev/null +++ b/tests/data/yolo_giou.cfg @@ -0,0 +1,81 @@ +[net] +width=256 +height=256 +channels=3 + +[convolutional] +batch_normalize=1 +filters=8 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=2 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=4 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +size=1 +stride=1 +pad=1 +filters=14 +activation=linear + +[yolo] +mask=2,3 +anchors=1,2, 3,4, 5,6, 9,10 +classes=2 +iou_loss=giou +scale_x_y=1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +ignore_thresh=0.7 + +[route] +layers = -4 + +[upsample] +stride=2 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=14 +activation=linear + +[yolo] +mask=0,1 +anchors=1,2, 3,4, 5,6, 9,10 +classes=2 +iou_loss=giou +scale_x_y=1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +ignore_thresh=0.7 diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 31ac35377f..2630182317 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -88,8 +88,15 @@ def test_yolo(tmpdir): model(image) -def 
test_yolo_train(tmpdir): - config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" +@pytest.mark.parametrize( + "cfg_name", + [ + ("yolo"), + ("yolo_giou"), + ], +) +def test_yolo_train(tmpdir, cfg_name): + config_path = Path(TEST_ROOT) / "data" / f"{cfg_name}.cfg" config = YOLOConfiguration(config_path) model = YOLO(config.get_network()) From b5abc8fded4a0f409ad6363cf43f5ce0a796faed Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Wed, 3 Aug 2022 00:01:15 +0900 Subject: [PATCH 19/76] add serveral yolo config & layers function test --- tests/models/yolo/__init__.py | 0 tests/models/yolo/unit/__init__.py | 0 tests/models/yolo/unit/test_yolo_config.py | 104 +++++++++++++++++++++ tests/models/yolo/unit/test_yolo_layers.py | 36 +++++++ 4 files changed, 140 insertions(+) create mode 100644 tests/models/yolo/__init__.py create mode 100644 tests/models/yolo/unit/__init__.py create mode 100644 tests/models/yolo/unit/test_yolo_config.py create mode 100644 tests/models/yolo/unit/test_yolo_layers.py diff --git a/tests/models/yolo/__init__.py b/tests/models/yolo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/yolo/unit/__init__.py b/tests/models/yolo/unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py new file mode 100644 index 0000000000..aab4c2f5a2 --- /dev/null +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -0,0 +1,104 @@ +from pathlib import Path + +import pytest + +from pl_bolts.models.detection.yolo.yolo_config import ( + YOLOConfiguration, + _create_convolutional, + _create_maxpool, + _create_shortcut, + _create_upsample, + _create_yolo, +) +from tests import TEST_ROOT + + +@pytest.mark.parametrize( + "config", + [ + ({"batch_normalize": 1, "filters": 8, "size": 3, "stride": 1, "pad": 1, "activation": "leaky"}), + ({"batch_normalize": 0, "filters": 2, "size": 1, "stride": 1, "pad": 1, "activation": "mish"}), + ({"batch_normalize": 1, "filters": 6, "size": 3, "stride": 2, "pad": 1, "activation": "logistic"}), + ({"batch_normalize": 0, "filters": 4, "size": 3, "stride": 2, "pad": 0, "activation": "linear"}), + ], +) +def test_create_convolutional(config): + conv = _create_convolutional(config, [3])[0] + + assert conv.conv.out_channels == config["filters"] + assert conv.conv.kernel_size == (config["size"], config["size"]) + assert conv.conv.stride == (config["stride"], config["stride"]) + + activation = config["activation"] + pad_size = (config["size"] - 1) // 2 if config["pad"] else 0 + + if config["pad"]: + assert conv.conv.padding == (pad_size, pad_size) + + if config["batch_normalize"]: + assert len(conv) == 3 + + if activation != "linear": + if activation != "logistic": + assert activation == conv[-1].__class__.__name__.lower()[: len(activation)] + elif activation == "logistic": + assert "sigmoid" == conv[-1].__class__.__name__.lower() + + +@pytest.mark.parametrize( + "config", + [ + ( + { + "size": 2, + "stride": 2, + } + ), + ( + { + "size": 6, + "stride": 3, + } + ), + ], +) +def test_create_maxpool(config): + pad_size = (config["size"] - 1) // 2 + maxpool = _create_maxpool(config, [3])[0] + + assert maxpool.kernel_size == config["size"] + assert maxpool.stride == config["stride"] + assert maxpool.padding == pad_size + + +@pytest.mark.parametrize( + "config", + [ + ({"from": 1, "activation": "linear"}), + ({"from": 3, "activation": "linear"}), + ], +) +def test_create_shortcut(config): + shortcut = _create_shortcut(config, [3])[0] + + 
assert shortcut.source_layer == config["from"] + + +@pytest.mark.parametrize( + "config", + [ + ({"stride": 2}), + ({"stride": 4}), + ], +) +def test_create_upsample(config): + upsample = _create_upsample(config, [3])[0] + + assert upsample.scale_factor == float(config["stride"]) + + +@pytest.mark.parametrize("config", [("yolo"), ("yolo_giou")]) +def test_yolo_config(config): + config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" + config = YOLOConfiguration(config_path) + model = config.get_network() diff --git a/tests/models/yolo/unit/test_yolo_layers.py b/tests/models/yolo/unit/test_yolo_layers.py new file mode 100644 index 0000000000..5b3013f6f8 --- /dev/null +++ b/tests/models/yolo/unit/test_yolo_layers.py @@ -0,0 +1,36 @@ +import pytest +import torch + +from pl_bolts.models.detection.yolo.yolo_layers import GIoULoss, IoULoss, SELoss, _corner_coordinates + + +@pytest.mark.parametrize( + "xy, wh, expected", + [ + ([0.0, 0.0], [1.0, 1.0], [-0.5, -0.5, 0.5, 0.5]), + ([5.0, 5.0], [2.0, 2.0], [4.0, 4.0, 6.0, 6.0]), + ], +) +def test_corner_coordinates(xy, wh, expected): + xy = torch.tensor(xy) + wh = torch.tensor(wh) + corners = _corner_coordinates(xy, wh) + assert torch.allclose(corners, torch.tensor(expected)) + + +@pytest.mark.parametrize( + "loss_func, bbox1, bbox2, expected", + [ + (GIoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.4144532680511475), + (IoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.0), + (SELoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 59479.0), + ], +) +def test_loss_functions(loss_func, bbox1, bbox2, expected): + loss_func = loss_func() + tensor1 = torch.tensor(bbox1, dtype=torch.float32) + tensor2 = torch.tensor(bbox2, dtype=torch.float32) + + loss = loss_func(tensor1, tensor2) + assert loss.item() > 0.0 + assert loss.item() == expected From 08f17f73a6c05982352754274cc3ba4133f45a55 Mon Sep 17 00:00:00 2001 From: Hongyeob Kim <41847456+heimish-kyma@users.noreply.github.com> Date: Wed, 3 Aug 2022 22:10:46 +0900 Subject: [PATCH 20/76] remove unused import & variable --- tests/models/yolo/unit/test_yolo_config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py index aab4c2f5a2..58ac900d7e 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -8,7 +8,6 @@ _create_maxpool, _create_shortcut, _create_upsample, - _create_yolo, ) from tests import TEST_ROOT @@ -101,4 +100,4 @@ def test_create_upsample(config): def test_yolo_config(config): config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" config = YOLOConfiguration(config_path) - model = config.get_network() + config.get_network() From db2601a4f9f1c17d0bbb32f9361db6d3ee3f2fcc Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Wed, 10 Aug 2022 01:22:08 +0900 Subject: [PATCH 21/76] add type hints --- pl_bolts/models/detection/yolo/yolo_config.py | 12 ++++++------ pl_bolts/models/detection/yolo/yolo_layers.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_config.py b/pl_bolts/models/detection/yolo/yolo_config.py index 254ee1523b..1bbb7eee86 100644 --- a/pl_bolts/models/detection/yolo/yolo_config.py +++ b/pl_bolts/models/detection/yolo/yolo_config.py @@ -170,7 +170,7 @@ def _create_layer(config: dict, num_inputs: List[int]) -> Tuple[nn.Module, int]: return create_func[config["type"]](config, num_inputs) -def 
_create_convolutional(config, num_inputs): +def _create_convolutional(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: module = nn.Sequential() batch_normalize = config.get("batch_normalize", False) @@ -206,13 +206,13 @@ def _create_convolutional(config, num_inputs): return module, config["filters"] -def _create_maxpool(config, num_inputs): +def _create_maxpool(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: padding = (config["size"] - 1) // 2 module = nn.MaxPool2d(config["size"], config["stride"], padding) return module, num_inputs[-1] -def _create_route(config, num_inputs): +def _create_route(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: num_chunks = config.get("groups", 1) chunk_idx = config.get("group_id", 0) @@ -228,17 +228,17 @@ def _create_route(config, num_inputs): return module, num_outputs -def _create_shortcut(config, num_inputs): +def _create_shortcut(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: module = yolo_layers.ShortcutLayer(config["from"]) return module, num_inputs[-1] -def _create_upsample(config, num_inputs): +def _create_upsample(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: module = nn.Upsample(scale_factor=config["stride"], mode="nearest") return module, num_inputs[-1] -def _create_yolo(config, num_inputs): +def _create_yolo(config: dict, num_inputs: int) -> Tuple[nn.Module, int]: # The "anchors" list alternates width and height. anchor_dims = config["anchors"] anchor_dims = [(anchor_dims[i], anchor_dims[i + 1]) for i in range(0, len(anchor_dims), 2)] diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index 9b1ee891df..011895c5bc 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -464,7 +464,7 @@ def _calculate_losses( class Mish(nn.Module): """Mish activation.""" - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: return x * torch.tanh(nn.functional.softplus(x)) @@ -483,7 +483,7 @@ def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> self.num_chunks = num_chunks self.chunk_idx = chunk_idx - def forward(self, x, outputs): + def forward(self, x, outputs: List[Union[Tensor, None]]) -> Tensor: chunks = [torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] for layer in self.source_layers] return torch.cat(chunks, dim=1) @@ -500,5 +500,5 @@ def __init__(self, source_layer: int) -> None: super().__init__() self.source_layer = source_layer - def forward(self, x, outputs): + def forward(self, x, outputs: List[Union[Tensor, None]]) -> Tensor: return outputs[-1] + outputs[self.source_layer] From fe38bb760b1c90539b3b1756b7524adf12358bf2 Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Wed, 10 Aug 2022 01:24:34 +0900 Subject: [PATCH 22/76] remove and merge duplicated test --- tests/models/test_detection.py | 5 +++-- tests/models/yolo/unit/test_yolo_config.py | 14 +------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 2630182317..64b202b0ea 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -79,8 +79,9 @@ def test_fasterrcnn_pyt_module_bbone_train(tmpdir): trainer.fit(model, train_dl, valid_dl) -def 
test_yolo(tmpdir): - config_path = Path(TEST_ROOT) / "data" / "yolo.cfg" +@pytest.mark.parametrize("config", [("yolo"), ("yolo_giou")]) +def test_yolo(config): + config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" config = YOLOConfiguration(config_path) model = YOLO(config.get_network()) diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py index aab4c2f5a2..2b2ec7833f 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -1,16 +1,11 @@ -from pathlib import Path - import pytest from pl_bolts.models.detection.yolo.yolo_config import ( - YOLOConfiguration, _create_convolutional, _create_maxpool, _create_shortcut, _create_upsample, - _create_yolo, ) -from tests import TEST_ROOT @pytest.mark.parametrize( @@ -92,13 +87,6 @@ def test_create_shortcut(config): ], ) def test_create_upsample(config): - upsample = _create_upsample(config, [3])[0] + upsample, _ = _create_upsample(config, [3]) assert upsample.scale_factor == float(config["stride"]) - - -@pytest.mark.parametrize("config", [("yolo"), ("yolo_giou")]) -def test_yolo_config(config): - config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" - config = YOLOConfiguration(config_path) - model = config.get_network() From 7da9d4a4dabd5e596026b76f58e0ac6eb5617520 Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Wed, 10 Aug 2022 01:25:29 +0900 Subject: [PATCH 23/76] improve readability --- tests/models/yolo/unit/test_yolo_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py index 2b2ec7833f..e3a33e5a70 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -18,7 +18,7 @@ ], ) def test_create_convolutional(config): - conv = _create_convolutional(config, [3])[0] + conv, _ = _create_convolutional(config, [3]) assert conv.conv.out_channels == config["filters"] assert conv.conv.kernel_size == (config["size"], config["size"]) @@ -59,7 +59,7 @@ def test_create_convolutional(config): ) def test_create_maxpool(config): pad_size = (config["size"] - 1) // 2 - maxpool = _create_maxpool(config, [3])[0] + maxpool, _ = _create_maxpool(config, [3]) assert maxpool.kernel_size == config["size"] assert maxpool.stride == config["stride"] @@ -74,7 +74,7 @@ def test_create_maxpool(config): ], ) def test_create_shortcut(config): - shortcut = _create_shortcut(config, [3])[0] + shortcut, _ = _create_shortcut(config, [3]) assert shortcut.source_layer == config["from"] From 63c7eeff7e052f9c62652c25097163bacb202544 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 19 Aug 2022 15:32:07 +0300 Subject: [PATCH 24/76] Use distance_box_iou(), complete_box_iou() and the corresponding loss functions from Torchvision --- .../models/detection/yolo/darknet_network.py | 106 ++++- pl_bolts/models/detection/yolo/layers.py | 21 +- pl_bolts/models/detection/yolo/loss.py | 381 ++++++++++-------- .../models/detection/yolo/target_matching.py | 6 +- .../models/detection/yolo/torch_networks.py | 153 +++---- pl_bolts/models/detection/yolo/yolo_module.py | 41 +- 6 files changed, 414 insertions(+), 294 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 2788122742..52765a7489 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -41,16 +41,17 @@ def __init__( config_path: 
Path to a Darknet configuration file that defines the network architecture. weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. in_channels: Number of channels in the input image. - match_sim_ota: If ``True``, matches a target to an anchor using the SimOTA algorithm from YOLOX. - match_size_ratio: If specified, matches a target to an anchor if its width and height relative to the anchor - is smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects - for each target the anchor with the highest IoU. - match_iou_threshold: If specified, matches a target to an anchor if the IoU is higher than this threshold. - ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has - IoU with some target greater than this threshold, the predictor will not be taken into account when - calculating the confidence loss. - overlap_func: Which function to use for calculating the overlap between boxes. Valid values are "iou", - "giou", "diou", and "ciou". + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching + rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is + below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the + prior shape that gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding + anchor has IoU with some target greater than this threshold, the predictor will not be taken into + account when calculating the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or + a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", + and "ciou". predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. @@ -293,6 +294,16 @@ def _create_layer(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREAT def _create_convolutional(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a convolutional layer. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ batch_normalize = config.get("batch_normalize", False) padding = (config["size"] - 1) // 2 if config["pad"] else 0 @@ -313,12 +324,33 @@ def _create_maxpool(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CRE """Creates a max pooling layer. Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. 
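+
+    For example, with ``size=2`` and ``stride=2`` a 13x13 feature map is pooled into a 7x7 output, since the output
+    resolution is ``ceil(13 / 2) = 7``.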
""" layer = MaxPool(config["size"], config["stride"]) return layer, num_inputs[-1] def _create_route(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a routing layer. + + A routing layer concatenates the output (or part of it) from the layers specified by the "layers" configuration + option. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ num_chunks = config.get("groups", 1) chunk_idx = config.get("group_id", 0) @@ -335,11 +367,33 @@ def _create_route(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREAT def _create_shortcut(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a shortcut layer. + + A shortcut layer adds a residual connection from the layer specified by the "from" configuration option. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ layer = layers.ShortcutLayer(config["from"]) return layer, num_inputs[-1] def _create_upsample(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a layer that upsamples the data. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") return layer, num_inputs[-1] @@ -358,6 +412,38 @@ def _create_yolo( class_loss_multiplier: Optional[float] = None, **kwargs: Any, ) -> CREATE_LAYER_OUTPUT: + """Creates a YOLO detection layer. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. Not used by the detection layer. + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. 
+ overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output (always 0 for a detection layer). + """ if prior_shapes is None: # The "anchors" list alternates width and height. dims = config["anchors"] diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index a8826ebb26..cfd0590140 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -113,7 +113,7 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str """ batch_size, num_features, height, width = x.shape num_attrs = self.num_classes + 5 - anchors_per_cell = torch.div(num_features, num_attrs, rounding_mode="floor") + anchors_per_cell = int(torch.div(num_features, num_attrs, rounding_mode="floor")) if anchors_per_cell != len(self.prior_shapes): raise MisconfigurationException( "The model predicts {} bounding boxes per spatial location, but {} prior box dimensions are defined " @@ -205,9 +205,8 @@ def _calculate_losses( "boxes": torch.cat(tuple(m[1]["boxes"] for m in matches)), "labels": torch.cat(tuple(m[1]["labels"] for m in matches)), } - self.loss_func(matched_preds, matched_targets, self.input_is_normalized, image_size) - overlap_loss, confidence_loss, class_loss = self.loss_func.sums() - self.losses = torch.stack((overlap_loss, confidence_loss, class_loss)) / batch_size + losses = self.loss_func.elementwise_sums(matched_preds, matched_targets, self.input_is_normalized, image_size) + self.losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / batch_size self.hits = len(matched_targets["boxes"]) @@ -276,7 +275,7 @@ def forward(self, x: Tensor) -> Tensor: class RouteLayer(nn.Module): - """Route layer concatenates the output (or part of it) from given layers. + """A routing layer concatenates the output (or part of it) from given layers. Args: source_layers: Indices of the layers whose output will be concatenated. @@ -296,7 +295,7 @@ def forward(self, x: Tensor, outputs: List[Tensor]) -> Tensor: class ShortcutLayer(nn.Module): - """Shortcut layer adds a residual connection from the source layer. + """A shortcut layer adds a residual connection from the source layer. Args: source_layer: Index of the layer whose output will be added to the output of the previous layer. @@ -387,16 +386,20 @@ def create_detection_layer( has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). 
+ function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + num_classes: Number of different classes that this layer predicts. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. + input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the + detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and + height are scaled up so that the maximum value is four times the anchor dimension. """ matching_func: Union[ShapeMatching, SimOTAMatching] if matching_algorithm == "simota": diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 7fef0ecff0..e00c73aea1 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -1,91 +1,166 @@ -import math +from dataclasses import dataclass from typing import Callable, Dict, Optional, Tuple, Union import torch - -try: - from pytorch_lightning.utilities.rank_zero import rank_zero_warn -except ModuleNotFoundError: - from pytorch_lightning.utilities.distributed import rank_zero_warn - from torch import Tensor from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg -if _TORCHVISION_AVAILABLE: - from torchvision.ops import box_iou, generalized_box_iou -else: +if not _TORCHVISION_AVAILABLE: warn_missing_pkg("torchvision") +else: + from torchvision.ops import box_iou + + try: + from torchvision.ops import generalized_box_iou + except ImportError: + generalized_box_iou = None + try: + from torchvision.ops import generalized_box_iou_loss + except ImportError: + generalized_box_iou_loss = None + try: + from torchvision.ops import distance_box_iou + except ImportError: + distance_box_iou = None + try: + from torchvision.ops import distance_box_iou_loss + except ImportError: + distance_box_iou_loss = None + try: + from torchvision.ops import complete_box_iou + except ImportError: + complete_box_iou = None + try: + from torchvision.ops import complete_box_iou_loss + except ImportError: + complete_box_iou_loss = None + + +def _size_compensation(targets: Tensor, image_size: Tensor) -> Tuple[Tensor, Tensor]: + """Calcuates the size compensation factor for the overlap loss. + + The overlap losses for each target should be multiplied by the returned weight. The returned value is + `2 - (unit_width * unit_height)`, which is large for small boxes (the maximum value is 2) and small for large boxes + (the minimum value is 1). + Args: + targets: An ``[N, 4]`` matrix of target `(x1, y1, x2, y2)` coordinates. + image_size: Image size, which is used to scale the target boxes to unit coordinates. + + Returns: + The size compensation factor. 
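+
+    For example, a target whose width and height are both half of the image size gets the weight
+    ``2 - (0.5 * 0.5) = 1.75``, while a target that covers the whole image gets the minimum weight ``1.0``.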
+ """ + unit_wh = targets[:, 2:] / image_size + return 2 - (unit_wh[:, 0] * unit_wh[:, 1]) -def _upcast(t: Tensor) -> Tensor: - """Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.""" - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() +def _pairwise_confidence_loss( + preds: Tensor, overlap: Tensor, bce_func: Callable, predict_overlap: Optional[float] +) -> Tensor: + """Calculates the confidence loss for every pair of a foreground anchor and a target. -def complete_iou(boxes1: Tensor, boxes2: Tensor, distance_only: bool = False) -> Tensor: - """Returns the complete intersection-over-union between two sets of boxes. Both sets of boxes are expected to - be in `(x1, y1, x2, y2)` format. + If ``predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target + confidence is 1. The method returns a matrix of losses for target/prediction pairs. Args: - boxes1: Box coordinates in a tensor of size ``[N, 4]``. - boxes2: Box coordinates in a tensor of size ``[M, 4]``. - distance_only: If set to ``True``, returns the Distance IoU. + preds: An ``[N]`` vector of predicted confidences. + overlap: An ``[M, N]`` matrix of overlaps between all target and predicted bounding boxes. + bce_func: A function for calculating binary cross entropy. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the overlap. Returns: - A matrix containing the `NxM` complete IoU values between boxes from ``boxes1`` and ``boxes2``. + An ``[M, N]`` matrix of confidence losses between all targets and predictions. """ + if predict_overlap is not None: + # When predicting overlap, target confidence is different for each pair of a prediction and a target. The + # tensors have to be broadcasted to [M, N]. + preds = preds.unsqueeze(0).expand(overlap.shape) + targets = torch.ones_like(preds) - predict_overlap + # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. + targets += predict_overlap * overlap.detach().clamp(min=0) + return bce_func(preds, targets, reduction="none") + else: + # When not predicting overlap, target confidence is the same for every target, but we should still return a + # matrix. + targets = torch.ones_like(preds) + return bce_func(preds, targets, reduction="none").unsqueeze(0).expand(overlap.shape) + - # Degenerate boxes give inf / nan results, so do an early check. - if not ((boxes1[:, 2:] >= boxes1[:, :2]).all() and (boxes2[:, 2:] >= boxes2[:, :2]).all()): - rank_zero_warn("Some boxes have negative width or height, or the coordinates contain infinite or NaN values.") +def _foreground_confidence_loss( + preds: Tensor, overlap: Tensor, bce_func: Callable, predict_overlap: Optional[float] +) -> Tensor: + """Calculates the sum of the confidence losses for foreground anchors and their matched targets. - iou = box_iou(boxes1, boxes2) + If ``predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target + confidence is 1. The method returns a vector of losses for each foreground anchor. - boxes1 = boxes1.unsqueeze(1) # [N, 1, 4] - boxes2 = boxes2.unsqueeze(0) # [1, M, 4] + Args: + preds: A vector of predicted confidences. 
+ overlap: A vector of overlaps between matched target and predicted bounding boxes. + bce_func: A function for calculating binary cross entropy. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the overlap. - lti = torch.min(boxes1[..., :2], boxes2[..., :2]) - rbi = torch.max(boxes1[..., 2:], boxes2[..., 2:]) + Returns: + The sum of the confidence losses for foreground anchors. + """ + targets = torch.ones_like(preds) + if predict_overlap is not None: + targets -= predict_overlap + # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. + targets += predict_overlap * overlap.detach().clamp(min=0) + return bce_func(preds, targets, reduction="sum") - whi = _upcast(rbi - lti).clamp(min=0) # [N, M, 2] - wi = whi[..., 0] - hi = whi[..., 1] - sqr_length = wi * wi + hi * hi # [N, M] - wh1 = boxes1[..., 2:] - boxes1[..., :2] - wh2 = boxes2[..., 2:] - boxes2[..., :2] - center1 = boxes1[..., :2] + (wh1 / 2) - center2 = boxes2[..., :2] + (wh2 / 2) - offset = center2 - center1 # [N, M, 2] - dx = offset[..., 0] - dy = offset[..., 1] - sqr_distance = dx * dx + dy * dy # [N, M] +def _background_confidence_loss(preds: Tensor, bce_func: Callable) -> Tensor: + """Calculates the sum of the confidence losses for background anchors. - diou = torch.where(sqr_length > 0.0, iou - (sqr_distance / sqr_length), iou) - if distance_only: - return diou + Args: + preds: A vector of predicted confidences for background anchors. + bce_func: A function for calculating binary cross entropy. - w1 = wh1[..., 0] - h1 = wh1[..., 1] - w2 = wh2[..., 0] - h2 = wh2[..., 1] - daspect = torch.atan(w2 / h2) - torch.atan(w1 / h1) # [N, M] - aspect_loss = 4 / (math.pi * math.pi) * (daspect * daspect) + Returns: + The sum of the background confidence losses. + """ + targets = torch.zeros_like(preds) + return bce_func(preds, targets, reduction="sum") - with torch.no_grad(): - alpha = aspect_loss / (1 - iou + aspect_loss + 1e-6) - return diou - (alpha * aspect_loss) +def _target_labels_to_probs(targets: Tensor, num_classes: int, dtype: torch.dtype) -> Tensor: + """If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. + Args: + targets: An ``[M, C]`` matrix of target class probabilities or an ``[M]`` vector of class labels. + num_classes: The number of classes (C dimension) for the new targets. If ``targets`` is already two-dimensional, + checks that the length of the second dimension matches this number. + dtype: Floating-point data type to be used for the one-hot targets. -def distance_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: - return complete_iou(boxes1, boxes2, distance_only=True) + Returns: + An ``[M, C]`` matrix of target class probabilities. + """ + if targets.ndim == 1: + # The data may contain a different number of classes than what the model predicts. In case a label is + # greater than the number of predicted classes, it will be mapped to the last class. + last_class = torch.tensor(num_classes - 1, device=targets.device) + targets = torch.min(targets, last_class) + targets = torch.nn.functional.one_hot(targets, num_classes) + elif targets.shape[-1] != num_classes: + raise ValueError( + f"The number of classes in the data ({targets.shape[-1]}) doesn't match the number of classes " + f"predicted by the model ({num_classes})." 
+ ) + return targets.to(dtype=dtype) + + +@dataclass +class Losses: + overlap: Tensor + confidence: Tensor + classification: Tensor class LossFunction: @@ -93,14 +168,14 @@ class LossFunction: Args: overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. """ def __init__( @@ -111,166 +186,114 @@ def __init__( confidence_multiplier: float = 1.0, class_multiplier: float = 1.0, ): + overlap_loss_func = None if overlap_func == "iou": - self.overlap_func = box_iou + overlap_func = box_iou elif overlap_func == "giou": - self.overlap_func = generalized_box_iou + overlap_func = generalized_box_iou + overlap_loss_func = generalized_box_iou_loss elif overlap_func == "diou": - self.overlap_func = distance_iou + overlap_func = distance_box_iou + overlap_loss_func = distance_box_iou_loss elif overlap_func == "ciou": - self.overlap_func = complete_iou - elif callable(overlap_func): - self.overlap_func = overlap_func + overlap_func = complete_box_iou + overlap_loss_func = complete_box_iou_loss + + if not callable(overlap_func): + raise ValueError("Unsupported overlap function. Try upgrading Torcvision or using another IoU algorithm.") + self._pairwise_overlap = overlap_func + + if callable(overlap_loss_func): + self._elementwise_overlap_loss = overlap_loss_func else: - raise ValueError(f"Overlap function type `{overlap_func}´ is unknown.") + self._elementwise_overlap_loss = lambda boxes1, boxes2: 1.0 - overlap_func(boxes1, boxes2).diagonal() self.predict_overlap = predict_overlap - self.overlap_multiplier = overlap_multiplier self.confidence_multiplier = confidence_multiplier self.class_multiplier = class_multiplier - def _calculate_overlap( - self, preds: Tensor, targets: Tensor, image_size: Optional[Tensor] = None - ) -> Tuple[Tensor, Tensor]: - """Calculates the overlap and overlap loss. - - The overlap is calculated using ``self.overlap_func``. Overlap loss is ``1 - overlap``. If ``image_size`` is - given, the loss is scaled by a factor that is large for small boxes (the maximum value is 2) and small for large - boxes (the minimum value is 1). - - Args: - preds: An ``[N, 4]`` matrix of predicted `(x1, y1, x2, y2)` coordinates. - targets: An ``[M, 4]`` matrix of target `(x1, y1, x2, y2)` coordinates. - image_size: If given, - - Returns: - overlap, overlap_loss: Two ``[M, N]`` matrices: the overlap and the overlap loss between all combinations of - a target and a prediction. 
- """ - overlap = self.overlap_func(targets, preds) - overlap_loss = 1.0 - overlap - if image_size is not None: - unit_wh = targets[:, 2:] / image_size - size_compensation = 2 - (unit_wh[:, 0] * unit_wh[:, 1]) - overlap_loss = overlap_loss * size_compensation - return overlap, overlap_loss - - def _calculate_confidence(self, preds: Tensor, overlap: Tensor, bce_func: Callable) -> Tensor: - """Calculates the confidence loss for foreground anchors. + def pairwise( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + input_is_normalized: bool, + ) -> Tuple[Losses, Tensor]: + """Calculates matrices containing the losses for all prediction/target pairs. - If ``self.predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target - confidence is 1. The method returns a matrix of losses for target/prediction pairs. + This method is called for obtaining costs for SimOTA matching. Args: - preds: An ``[N]`` vector of predicted confidences. - overlap: An ``[M, N]`` matrix of the overlap between all combinations of a target bounding box and a - predicted bounding box. - bce_func: A function for calculating binary cross entropy. + preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". + targets: A dictionary of training targets, containing "boxes" and "labels". + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. Returns: - An ``[M, N]`` matrix of confidence loss between all combinations of a target and a prediction. + Loss matrices and an overlap matrix. """ - if self.predict_overlap is not None: - # When predicting overlap, target confidence is different for each pair of a prediction and a target. The - # tensors have to be broadcasted to [M, N]. - preds = preds.unsqueeze(0).expand(overlap.shape) - targets = torch.ones_like(preds) - self.predict_overlap - # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. - targets = targets + (self.predict_overlap * overlap.detach().clamp(min=0)) + if input_is_normalized: + bce_func = binary_cross_entropy else: - targets = torch.ones_like(preds) - - result = bce_func(preds, targets, reduction="none") - - if result.ndim == 1: - # When not predicting overlap, target confidence is the same for every target, but we should still return a - # matrix. - result = result.unsqueeze(0).expand(overlap.shape) + bce_func = binary_cross_entropy_with_logits - return result + overlap = self._pairwise_overlap(targets["boxes"], preds["boxes"]) + overlap_loss = 1.0 - overlap - def _calculate_bg_confidence(self, preds: Tensor, bce_func: Callable) -> Tensor: - """Calculates the confidence loss for background anchors.""" - targets = torch.zeros_like(preds) - return bce_func(preds, targets, reduction="none") + confidence_loss = _pairwise_confidence_loss(preds["confidences"], overlap, bce_func, self.predict_overlap) - def _calculate_class(self, preds: Tensor, targets: Tensor, bce_func: Callable) -> Tensor: - """Calculates the classification losses. 
+ pred_probs = preds["classprobs"].unsqueeze(0) # [1, preds, classes] + target_probs = _target_labels_to_probs(targets["labels"], pred_probs.shape[-1], pred_probs.dtype) + target_probs = target_probs.unsqueeze(1) # [targets, 1, classes] + pred_probs, target_probs = torch.broadcast_tensors(pred_probs, target_probs) + class_loss = bce_func(pred_probs, target_probs, reduction="none").sum(-1) - If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. Then - calculates the classification losses between the predictions and the targets. If ``all_pairs`` is ``True``, - returns a matrix of losses between all combinations of a target and a prediction. + losses = Losses( + overlap_loss * self.overlap_multiplier, + confidence_loss * self.confidence_multiplier, + class_loss * self.class_multiplier, + ) - Args: - preds: An ``[N, C]`` matrix of predicted class probabilities. - targets: An ``[M, C]`` matrix of target class probabilities or an ``[M]`` vector of class labels. - bce_func: A function for calculating binary cross entropy. + return losses, overlap - Returns: - An ``[M, N]`` matrix of losses between all combinations of a target and a prediction. - """ - num_classes = preds.shape[-1] - if targets.ndim == 1: - # The data may contain a different number of classes than what the model predicts. In case a label is - # greater than the number of predicted classes, it will be mapped to the last class. - last_class = torch.tensor(num_classes - 1, device=targets.device) - targets = torch.min(targets, last_class) - targets = torch.nn.functional.one_hot(targets, num_classes) - elif targets.shape[-1] != num_classes: - raise ValueError( - f"The number of classes in the data ({targets.shape[-1]}) doesn't match the number of classes " - f"predicted by the model ({num_classes})." - ) - targets = targets.to(dtype=preds.dtype) - - preds = preds.unsqueeze(0) # [1, preds, classes] - targets = targets.unsqueeze(1) # [targets, 1, classes] - preds, targets = torch.broadcast_tensors(preds, targets) - return bce_func(preds, targets, reduction="none").sum(-1) - - def __call__( + def elementwise_sums( self, preds: Dict[str, Tensor], targets: Dict[str, Tensor], input_is_normalized: bool, - image_size: Optional[Tensor] = None, - ) -> None: - """Calculates the losses for all pairs of a predictions and a target, and if `bg_confidences` appears in - ``preds``, calculates the confidence loss for background predictions. - - This method is called before taking the final losses using ``sums()``, and for obtaining costs for SimOTA - matching. + image_size: Tensor, + ) -> Losses: + """Calculates the sums of the losses for optimization, over prediction/target pairs, assuming the + predictions and targets have been matched (there are as many predictions and targets). Args: preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". targets: A dictionary of training targets, containing "boxes" and "labels". input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. image_size: Width and height in a vector that defines the scale of the target coordinates. + + Returns: + The final losses. 
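+
+        The returned sums are taken over the matched prediction/target pairs; the detection layer divides them by the
+        batch size before they are combined into the total loss.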
""" if input_is_normalized: bce_func = binary_cross_entropy else: bce_func = binary_cross_entropy_with_logits - overlap, overlap_loss = self._calculate_overlap(preds["boxes"], targets["boxes"], image_size) - self.overlap = overlap - self.overlap_loss = overlap_loss * self.overlap_multiplier + overlap_loss = self._elementwise_overlap_loss(targets["boxes"], preds["boxes"]) + overlap = 1.0 - overlap_loss + overlap_loss = (overlap_loss * _size_compensation(targets["boxes"], image_size)).sum() - confidence_loss = self._calculate_confidence(preds["confidences"], overlap, bce_func) - self.confidence_loss = confidence_loss * self.confidence_multiplier + confidence_loss = _foreground_confidence_loss(preds["confidences"], overlap, bce_func, self.predict_overlap) + confidence_loss += _background_confidence_loss(preds["bg_confidences"], bce_func) - if "bg_confidences" in preds: - bg_confidence_loss = self._calculate_bg_confidence(preds["bg_confidences"], bce_func) - self.bg_confidence_loss = bg_confidence_loss * self.confidence_multiplier + pred_probs = preds["classprobs"] + target_probs = _target_labels_to_probs(targets["labels"], pred_probs.shape[-1], pred_probs.dtype) + class_loss = bce_func(pred_probs, target_probs, reduction="sum") - class_loss = self._calculate_class(preds["classprobs"], targets["labels"], bce_func) - self.class_loss = class_loss * self.class_multiplier + losses = Losses( + overlap_loss * self.overlap_multiplier, + confidence_loss * self.confidence_multiplier, + class_loss * self.class_multiplier, + ) - def sums(self) -> Tuple[Tensor, Tensor, Tensor]: - """Returns the sums of the losses over prediction/target pairs, assuming the predictions and targets have - been matched (there are as many predictions and targets).""" - overlap_loss = self.overlap_loss.diagonal().sum() - confidence_loss = self.confidence_loss.diagonal().sum() + self.bg_confidence_loss.sum() - class_loss = self.class_loss.diagonal().sum() - return overlap_loss, confidence_loss, class_loss + return losses diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index ba38e0c4f0..b8dd6f20c5 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -314,10 +314,10 @@ def __call__( } bg_confidences = preds["confidences"].view(shape)[bg_mask].view(-1) - self.loss_func(fg_preds, targets, input_is_normalized=False) - costs = self.loss_func.overlap_loss + self.loss_func.confidence_loss + self.loss_func.class_loss + losses, ious = self.loss_func.pairwise(fg_preds, targets, input_is_normalized=False) + costs = losses.overlap + losses.confidence + losses.classification costs += 100000.0 * ~inside_matrix[:, fg_mask].repeat_interleave(boxes_per_cell, 1) - matched_preds, matched_targets = _sim_ota_match(costs, self.loss_func.overlap) + matched_preds, matched_targets = _sim_ota_match(costs, ious) preds = { "boxes": fg_preds["boxes"][matched_preds], diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index e7078da682..4204112f63 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -371,14 +371,14 @@ class YOLOV4TinyNetwork(nn.Module): with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. 
Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. """ @@ -390,7 +390,7 @@ def __init__( width: int = 32, activation: Optional[str] = "leaky", normalization: Optional[str] = "batchnorm", - prior_shapes: List[Tuple[int, int]] = None, + prior_shapes: Optional[List[Tuple[int, int]]] = None, **kwargs: Any, ) -> None: super().__init__() @@ -432,7 +432,7 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: prior_shapes, prior_shape_idxs, num_classes=num_classes, input_is_normalized=False, **kwargs ) - self.backbone = backbone or YOLOV4TinyBackbone(width=width, normalization=normalization, activation=activation) + self.backbone = backbone or YOLOV4TinyBackbone(width=width, activation=activation, normalization=normalization) self.fpn5 = conv(width * 16, width * 8) self.out5 = nn.Sequential( @@ -516,14 +516,14 @@ class YOLOV4Network(nn.Module): with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. 
""" @@ -535,7 +535,7 @@ def __init__( widths: Sequence[int] = (32, 64, 128, 256, 512, 1024), activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", - prior_shapes: List[Tuple[int, int]] = None, + prior_shapes: Optional[List[Tuple[int, int]]] = None, **kwargs: Any, ) -> None: super().__init__() @@ -598,7 +598,7 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: if backbone is not None: self.backbone = backbone else: - self.backbone = YOLOV4Backbone(widths=widths, normalization=normalization, activation=activation) + self.backbone = YOLOV4Backbone(widths=widths, activation=activation, normalization=normalization) w3 = widths[-3] w4 = widths[-2] @@ -692,14 +692,14 @@ class YOLOV4P6Network(nn.Module): with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. """ @@ -711,7 +711,7 @@ def __init__( widths: Sequence[int] = (32, 64, 128, 256, 512, 1024, 1024), activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", - prior_shapes: List[Tuple[int, int]] = None, + prior_shapes: Optional[List[Tuple[int, int]]] = None, **kwargs: Any, ) -> None: super().__init__() @@ -782,7 +782,7 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.backbone = backbone else: self.backbone = YOLOV4Backbone( - widths=widths, depths=(1, 1, 3, 15, 15, 7, 7), normalization=normalization, activation=activation + widths=widths, depths=(1, 1, 3, 15, 15, 7, 7), activation=activation, normalization=normalization ) w3 = widths[-4] @@ -900,14 +900,14 @@ class YOLOV5Network(nn.Module): with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. 
- class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. """ @@ -920,7 +920,7 @@ def __init__( depth: int = 3, activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", - prior_shapes: List[Tuple[int, int]] = None, + prior_shapes: Optional[List[Tuple[int, int]]] = None, **kwargs: Any, ) -> None: super().__init__() @@ -975,7 +975,7 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: ) self.backbone = backbone or YOLOV5Backbone( - depth=depth, width=width, normalization=normalization, activation=activation + depth=depth, width=width, activation=activation, normalization=normalization ) self.spp = spp(width * 16, width * 16) @@ -1049,6 +1049,50 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU return detections, losses, hits +class YOLOXHead(nn.Module): + def __init__( + self, + in_channels: int, + hidden_channels: int, + anchors_per_cell: int, + num_classes: int, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + + def linear(in_channels: int, out_channels: int) -> nn.Module: + return nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def features(num_channels: int) -> nn.Module: + return nn.Sequential( + conv(num_channels, num_channels, kernel_size=3), + conv(num_channels, num_channels, kernel_size=3), + ) + + def classprob(num_channels: int) -> nn.Module: + num_outputs = anchors_per_cell * num_classes + outputs = linear(num_channels, num_outputs) + return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) + + self.stem = conv(in_channels, hidden_channels) + self.feat = features(hidden_channels) + self.box = linear(hidden_channels, anchors_per_cell * 4) + self.confidence = linear(hidden_channels, anchors_per_cell) + self.classprob = classprob(hidden_channels) + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + features = self.feat(x) + box = self.box(features) + confidence = self.confidence(features) + classprob = self.classprob(x) + return torch.cat((box, confidence, classprob), dim=1) + + class YOLOXNetwork(nn.Module): """The YOLOX network architecture. Different variants (nano/tiny/s/m/l/x) can be achieved by adjusting the ``depth`` and ``width`` parameters. @@ -1076,14 +1120,14 @@ class YOLOXNetwork(nn.Module): with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a tensor with as many elements as there are input boxes. Valid values for a string are - "iou", "giou", "diou", and "ciou" (default). + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). predict_overlap: Balance between binary confidence targets and predicting the overlap. 
0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. """ @@ -1096,7 +1140,7 @@ def __init__( depth: int = 3, activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", - prior_shapes: List[Tuple[int, int]] = None, + prior_shapes: Optional[List[Tuple[int, int]]] = None, **kwargs: Any, ) -> None: super().__init__() @@ -1119,9 +1163,6 @@ def downsample(in_channels: int, out_channels: int) -> nn.Module: def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) - def linear(in_channels: int, out_channels: int) -> nn.Module: - return nn.Conv2d(in_channels, out_channels, kernel_size=1) - def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPBlock( in_channels, @@ -1132,17 +1173,16 @@ def csp(in_channels: int, out_channels: int) -> nn.Module: activation=activation, ) - def features(num_channels: int) -> nn.Module: - return nn.Sequential( - conv(num_channels, num_channels, kernel_size=3), - conv(num_channels, num_channels, kernel_size=3), + def head(in_channels: int, hidden_channels: int) -> YOLOXHead: + return YOLOXHead( + in_channels, + hidden_channels, + anchors_per_cell, + num_classes, + activation=activation, + normalization=normalization, ) - def classprob(num_channels: int) -> nn.Module: - num_outputs = anchors_per_cell * num_classes - outputs = linear(num_channels, num_outputs) - return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) - def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: assert prior_shapes is not None return create_detection_layer( @@ -1150,17 +1190,13 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: ) self.backbone = backbone or YOLOV5Backbone( - depth=depth, width=width, normalization=normalization, activation=activation + depth=depth, width=width, activation=activation, normalization=normalization ) self.spp = spp(width * 16, width * 16) self.pan3 = csp(width * 8, width * 4) - self.out3_stem = conv(width * 4, width * 4) - self.out3_feat = features(width * 4) - self.out3_box = linear(width * 4, anchors_per_cell * 4) - self.out3_confidence = linear(width * 4, anchors_per_cell) - self.out3_classprob = classprob(width * 4) + self.out3 = head(width * 4, width * 4) self.fpn4 = nn.Sequential( OrderedDict( @@ -1171,19 +1207,11 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: ) ) self.pan4 = csp(width * 8, width * 8) - self.out4_stem = conv(width * 8, width * 4) - self.out4_feat = features(width * 4) - self.out4_box = linear(width * 4, anchors_per_cell * 4) - self.out4_confidence = linear(width * 4, anchors_per_cell) - self.out4_classprob = classprob(width * 4) + self.out4 = head(width * 8, width * 4) self.fpn5 = conv(width * 16, width * 8) self.pan5 = csp(width * 16, width * 16) - self.out5_stem = conv(width * 16, width * 4) - self.out5_feat = features(width * 4) - self.out5_box = linear(width * 4, 
anchors_per_cell * 4) - self.out5_confidence = linear(width * 4, anchors_per_cell) - self.out5_classprob = classprob(width * 4) + self.out5 = head(width * 16, width * 4) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") @@ -1215,34 +1243,19 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample4(n4), p5), dim=1) n5 = self.pan5(x) - x = self.out3_stem(n3) - features = self.out3_feat(x) - box = self.out3_box(features) - confidence = self.out3_confidence(features) - classprob = self.out3_classprob(x) - y = self.detect3(torch.cat((box, confidence, classprob), dim=1), image_size, targets) + y = self.detect3(self.out3(n3), image_size, targets) detections.append(y) if targets is not None: losses.append(self.detect3.losses) hits.append(self.detect3.hits) - x = self.out4_stem(n4) - features = self.out4_feat(x) - box = self.out4_box(features) - confidence = self.out4_confidence(features) - classprob = self.out4_classprob(x) - y = self.detect4(torch.cat((box, confidence, classprob), dim=1), image_size, targets) + y = self.detect4(self.out4(n4), image_size, targets) detections.append(y) if targets is not None: losses.append(self.detect4.losses) hits.append(self.detect4.hits) - x = self.out5_stem(n5) - features = self.out5_feat(x) - box = self.out5_box(features) - confidence = self.out5_confidence(features) - classprob = self.out5_classprob(x) - y = self.detect5(torch.cat((box, confidence, classprob), dim=1), image_size, targets) + y = self.detect5(self.out5(n5), image_size, targets) detections.append(y) if targets is not None: losses.append(self.detect5.losses) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index d59f0582fd..50dc45ccc8 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -406,10 +406,6 @@ def _validate_batch(self, batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, class DarknetYOLO(YOLO): """A subclass of YOLO that uses a Darknet configuration file and can be configured using LightningCLI. - At most one matching algorithm, ``match_sim_ota``, ``match_size_ratio``, or ``match_iou_threshold`` can be - specified. If none of them is given, the default algorithm is used, which matches a target to the prior shape - (anchor) that gives the highest IoU. - CLI command:: # PascalVOC using LightningCLI @@ -419,51 +415,50 @@ class DarknetYOLO(YOLO): Args: network_config: Path to a Darknet configuration file that defines the network architecture. - match_sim_ota: If ``True``, matches a target to an anchor using the SimOTA algorithm from YOLOX. - match_size_ratio: If specified, matches a target to an anchor if its width and height relative to the anchor is - smaller than this ratio. If ``match_size_ratio`` or ``match_iou_threshold`` is not specified, selects for - each target the anchor with the highest IoU. - match_iou_threshold: If specified, matches a target to an anchor if the IoU is higher than this threshold. - ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU - with some target greater than this threshold, the predictor will not be taken into account when calculating - the confidence loss. - overlap_func: Which function to use for calculating the overlap between boxes. Valid values are "iou", "giou", - "diou", and "ciou". + matching_algorithm: Which algorithm to use for matching targets to anchors. 
"simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou". predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. overlap_loss_multiplier: Overlap loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. """ def __init__( self, network_config: str, darknet_weights: Optional[str] = None, - match_sim_ota: bool = False, - match_size_ratio: Optional[float] = None, - match_iou_threshold: Optional[float] = None, + matching_algorithm: Optional[str] = None, + matching_threshold: Optional[float] = None, ignore_bg_threshold: Optional[float] = None, overlap_func: Optional[str] = None, predict_overlap: Optional[float] = None, overlap_loss_multiplier: Optional[float] = None, - class_loss_multiplier: Optional[float] = None, confidence_loss_multiplier: Optional[float] = None, + class_loss_multiplier: Optional[float] = None, **kwargs: Any, ) -> None: network = DarknetNetwork( network_config, darknet_weights, - match_sim_ota=match_sim_ota, - match_size_ratio=match_size_ratio, - match_iou_threshold=match_iou_threshold, + matching_algorithm=matching_algorithm, + matching_threshold=matching_threshold, ignore_bg_threshold=ignore_bg_threshold, overlap_func=overlap_func, predict_overlap=predict_overlap, overlap_loss_multiplier=overlap_loss_multiplier, - class_loss_multiplier=class_loss_multiplier, confidence_loss_multiplier=confidence_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, ) super().__init__(**kwargs, network=network) From 17fab640fad166960178a77df661a2112dd7ba78 Mon Sep 17 00:00:00 2001 From: "Hongyeob.Kim" Date: Mon, 19 Sep 2022 14:26:02 +0900 Subject: [PATCH 25/76] add catch_warning fixture --- tests/models/test_detection.py | 21 +++++++++++-- tests/models/yolo/unit/test_yolo_config.py | 35 +++++++++++++++++++--- tests/models/yolo/unit/test_yolo_layers.py | 18 +++++++++-- 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 64b202b0ea..e4a996a014 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -2,8 +2,11 @@ import pytest import torch -from pytorch_lightning import Trainer +import warnings + from torch.utils.data import DataLoader +from pytorch_lightning import Trainer +from pytorch_lightning.utilities.warnings import PossibleUserWarning from pl_bolts.datasets import DummyDetectionDataset from pl_bolts.models.detection import YOLO, 
FasterRCNN, RetinaNet, YOLOConfiguration @@ -96,7 +99,13 @@ def test_yolo(config): ("yolo_giou"), ], ) -def test_yolo_train(tmpdir, cfg_name): +def test_yolo_train(tmpdir, cfg_name, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + config_path = Path(TEST_ROOT) / "data" / f"{cfg_name}.cfg" config = YOLOConfiguration(config_path) model = YOLO(config.get_network()) @@ -118,5 +127,11 @@ def test_yolo_train(tmpdir, cfg_name): ) ], ) -def test_aligned_iou(dims1, dims2, expected_ious): +def test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + torch.testing.assert_allclose(_aligned_iou(dims1, dims2), expected_ious) diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py index e3a33e5a70..6e8b004e4c 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -1,4 +1,7 @@ import pytest +import warnings + +from pytorch_lightning.utilities.warnings import PossibleUserWarning from pl_bolts.models.detection.yolo.yolo_config import ( _create_convolutional, @@ -17,7 +20,13 @@ ({"batch_normalize": 0, "filters": 4, "size": 3, "stride": 2, "pad": 0, "activation": "linear"}), ], ) -def test_create_convolutional(config): +def test_create_convolutional(config, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + conv, _ = _create_convolutional(config, [3]) assert conv.conv.out_channels == config["filters"] @@ -57,7 +66,13 @@ def test_create_convolutional(config): ), ], ) -def test_create_maxpool(config): +def test_create_maxpool(config, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + pad_size = (config["size"] - 1) // 2 maxpool, _ = _create_maxpool(config, [3]) @@ -73,7 +88,13 @@ def test_create_maxpool(config): ({"from": 3, "activation": "linear"}), ], ) -def test_create_shortcut(config): +def test_create_shortcut(config, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + shortcut, _ = _create_shortcut(config, [3]) assert shortcut.source_layer == config["from"] @@ -86,7 +107,13 @@ def test_create_shortcut(config): ({"stride": 4}), ], ) -def test_create_upsample(config): +def test_create_upsample(config, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + upsample, _ = _create_upsample(config, [3]) assert upsample.scale_factor == float(config["stride"]) diff --git a/tests/models/yolo/unit/test_yolo_layers.py b/tests/models/yolo/unit/test_yolo_layers.py index 5b3013f6f8..6ebf320494 100644 --- a/tests/models/yolo/unit/test_yolo_layers.py +++ b/tests/models/yolo/unit/test_yolo_layers.py @@ -1,7 +1,9 @@ import pytest import torch +import warnings from pl_bolts.models.detection.yolo.yolo_layers import GIoULoss, IoULoss, SELoss, _corner_coordinates +from pytorch_lightning.utilities.warnings import PossibleUserWarning @pytest.mark.parametrize( @@ -11,7 +13,13 @@ ([5.0, 5.0], [2.0, 2.0], [4.0, 4.0, 6.0, 6.0]), ], ) 
-def test_corner_coordinates(xy, wh, expected): +def test_corner_coordinates(xy, wh, expected, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + xy = torch.tensor(xy) wh = torch.tensor(wh) corners = _corner_coordinates(xy, wh) @@ -26,7 +34,13 @@ def test_corner_coordinates(xy, wh, expected): (SELoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 59479.0), ], ) -def test_loss_functions(loss_func, bbox1, bbox2, expected): +def test_loss_functions(loss_func, bbox1, bbox2, expected, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + loss_func = loss_func() tensor1 = torch.tensor(bbox1, dtype=torch.float32) tensor2 = torch.tensor(bbox2, dtype=torch.float32) From 353f1199a79cd923a720c1bd3fe476ce6d47cd05 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Sep 2022 05:47:59 +0000 Subject: [PATCH 26/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/models/test_detection.py | 5 ++--- tests/models/yolo/unit/test_yolo_config.py | 2 +- tests/models/yolo/unit/test_yolo_layers.py | 5 +++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index e4a996a014..14be7b7803 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -1,12 +1,11 @@ +import warnings from pathlib import Path import pytest import torch -import warnings - -from torch.utils.data import DataLoader from pytorch_lightning import Trainer from pytorch_lightning.utilities.warnings import PossibleUserWarning +from torch.utils.data import DataLoader from pl_bolts.datasets import DummyDetectionDataset from pl_bolts.models.detection import YOLO, FasterRCNN, RetinaNet, YOLOConfiguration diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py index 6e8b004e4c..807e2a84db 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_yolo_config.py @@ -1,6 +1,6 @@ -import pytest import warnings +import pytest from pytorch_lightning.utilities.warnings import PossibleUserWarning from pl_bolts.models.detection.yolo.yolo_config import ( diff --git a/tests/models/yolo/unit/test_yolo_layers.py b/tests/models/yolo/unit/test_yolo_layers.py index 6ebf320494..02e209d400 100644 --- a/tests/models/yolo/unit/test_yolo_layers.py +++ b/tests/models/yolo/unit/test_yolo_layers.py @@ -1,9 +1,10 @@ +import warnings + import pytest import torch -import warnings +from pytorch_lightning.utilities.warnings import PossibleUserWarning from pl_bolts.models.detection.yolo.yolo_layers import GIoULoss, IoULoss, SELoss, _corner_coordinates -from pytorch_lightning.utilities.warnings import PossibleUserWarning @pytest.mark.parametrize( From a3445acb2494ac8e9d07823242e80008d58db2b7 Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Mon, 19 Sep 2022 16:38:33 +0900 Subject: [PATCH 27/76] fix pytest error; indexing argument will be required to pass in upcoming release for torch.meshgrid. 
--- pl_bolts/models/detection/yolo/yolo_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index 011895c5bc..dda5ca2a59 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -256,7 +256,7 @@ def _global_xy(self, xy: Tensor, image_size: Tensor) -> Tensor: x_range = torch.arange(width, device=xy.device) y_range = torch.arange(height, device=xy.device) - grid_y, grid_x = torch.meshgrid(y_range, x_range) + grid_y, grid_x = torch.meshgrid(y_range, x_range, indexing='ij') offset = torch.stack((grid_x, grid_y), -1) # [height, width, 2] offset = offset.unsqueeze(2) # [height, width, 1, 2] From 189346cd508a4adaa996e3e553b11e40ce2a783c Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Mon, 19 Sep 2022 16:51:49 +0900 Subject: [PATCH 28/76] fix pytest catch_warnings; MisconfigurationException error batch_size should be provided for def training_step(self, dataloader_iter) --- pl_bolts/models/detection/yolo/yolo_module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index ebb494f5ef..650765421d 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -177,7 +177,7 @@ def forward( ) for layer_idx, layer_hits in enumerate(hits): hit_rate = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 - self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=False) + self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=False, batch_size=images.size(0)) def total_loss(loss_name): """Returns the sum of the loss over detection layers.""" @@ -231,8 +231,8 @@ def validation_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], b total_loss = torch.stack(tuple(losses.values())).sum() for name, value in losses.items(): - self.log(f"val/{name}_loss", value, sync_dist=True) - self.log("val/total_loss", total_loss, sync_dist=True) + self.log(f"val/{name}_loss", value, sync_dist=True, batch_size=images.size(0)) + self.log("val/total_loss", total_loss, sync_dist=True, batch_size=images.size(0)) def test_step(self, batch: Tuple[List[Tensor], List[Dict[str, Tensor]]], batch_idx: int): """Evaluates a batch of data from the test set. From a1d97b672538911626eb33b1a04d7c35489911af Mon Sep 17 00:00:00 2001 From: heimish-kyma Date: Mon, 19 Sep 2022 16:54:15 +0900 Subject: [PATCH 29/76] fix pytest error with fast_dev_run=True, it will set 1 for batch(es) of train, however log_every_n_steps is set 50 as default this cause an error catching. 
--- tests/models/test_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index e4a996a014..e620882183 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -113,7 +113,7 @@ def test_yolo_train(tmpdir, cfg_name, catch_warnings): train_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False) trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) From 0b4eca495f4d1135aec12b01425e1995f8470905 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Sep 2022 07:56:22 +0000 Subject: [PATCH 30/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pl_bolts/models/detection/yolo/yolo_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index dda5ca2a59..f71473308a 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -256,7 +256,7 @@ def _global_xy(self, xy: Tensor, image_size: Tensor) -> Tensor: x_range = torch.arange(width, device=xy.device) y_range = torch.arange(height, device=xy.device) - grid_y, grid_x = torch.meshgrid(y_range, x_range, indexing='ij') + grid_y, grid_x = torch.meshgrid(y_range, x_range, indexing="ij") offset = torch.stack((grid_x, grid_y), -1) # [height, width, 2] offset = offset.unsqueeze(2) # [height, width, 1, 2] From eb9930ec8c36bf766ce6bc25649a46f785f1101f Mon Sep 17 00:00:00 2001 From: otaj Date: Mon, 19 Sep 2022 13:33:36 +0200 Subject: [PATCH 31/76] Fix most obvious CI failings --- pl_bolts/models/detection/yolo/yolo_layers.py | 7 +++++-- pl_bolts/utils/__init__.py | 1 + tests/models/test_detection.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_layers.py b/pl_bolts/models/detection/yolo/yolo_layers.py index f71473308a..d2259520e5 100644 --- a/pl_bolts/models/detection/yolo/yolo_layers.py +++ b/pl_bolts/models/detection/yolo/yolo_layers.py @@ -4,7 +4,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor, nn -from pl_bolts.utils import _TORCHVISION_AVAILABLE +from pl_bolts.utils import _TORCH_MESHGRID_REQUIRES_INDEXING, _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg if _TORCHVISION_AVAILABLE: @@ -256,7 +256,10 @@ def _global_xy(self, xy: Tensor, image_size: Tensor) -> Tensor: x_range = torch.arange(width, device=xy.device) y_range = torch.arange(height, device=xy.device) - grid_y, grid_x = torch.meshgrid(y_range, x_range, indexing="ij") + if _TORCH_MESHGRID_REQUIRES_INDEXING: + grid_y, grid_x = torch.meshgrid(y_range, x_range, indexing="ij") + else: + grid_y, grid_x = torch.meshgrid(y_range, x_range) offset = torch.stack((grid_x, grid_y), -1) # [height, width, 2] offset = offset.unsqueeze(2) # [height, width, 1, 2] diff --git a/pl_bolts/utils/__init__.py b/pl_bolts/utils/__init__.py index 67907890a6..67746dc3dc 100644 --- a/pl_bolts/utils/__init__.py +++ b/pl_bolts/utils/__init__.py @@ -44,6 +44,7 @@ def _compare_version(package: str, op: Callable, version: str) -> bool: 
_PL_GREATER_EQUAL_1_4_5 = _compare_version("pytorch_lightning", operator.ge, "1.4.5") _TORCH_ORT_AVAILABLE = _module_available("torch_ort") _TORCH_MAX_VERSION_SPARSEML = _compare_version("torch", operator.lt, "1.11.0") +_TORCH_MESHGRID_REQUIRES_INDEXING = _compare_version("torch", operator.ge, "1.10.0") _SPARSEML_AVAILABLE = _module_available("sparseml") and _PL_GREATER_EQUAL_1_4_5 and _TORCH_MAX_VERSION_SPARSEML __all__ = ["BatchGradientVerification"] diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index f4861fa2f2..de0ba23696 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -133,4 +133,4 @@ def test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): category=PossibleUserWarning, ) - torch.testing.assert_allclose(_aligned_iou(dims1, dims2), expected_ious) + torch.testing.assert_close(_aligned_iou(dims1, dims2), expected_ious) From fdf38fbecfe5e0b1db0d7a085aa71d37f4dd9568 Mon Sep 17 00:00:00 2001 From: otaj Date: Mon, 19 Sep 2022 14:01:46 +0200 Subject: [PATCH 32/76] fix test with a missing warning --- tests/models/test_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index de0ba23696..1c3e2f0b14 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -113,7 +113,7 @@ def test_yolo_train(tmpdir, cfg_name, catch_warnings): valid_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False) - trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl, max_expochs=10) @pytest.mark.parametrize( From 28813acea97f11a0cfed6f9cff4a12fec88f4195 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 21 Sep 2022 17:35:13 +0300 Subject: [PATCH 33/76] Refactoring --- pl_bolts/models/detection/yolo/yolo_module.py | 88 ++++++++++--------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index e83ab08ab9..85a51b917e 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -34,6 +34,47 @@ warn_missing_pkg("torchvision") +def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: + """Reads a batch of data, validates the format, and stacks the images into a single tensor. + + Args: + batch: The batch of data read by the :class:`~torch.utils.data.DataLoader`. + + Returns: + The input batch with images stacked into a single tensor. 
+ """ + images, targets = batch + + if not images: + raise ValueError("No images in batch.") + + if len(images) != len(targets): + raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") + + shape = images[0].shape + for image in images: + if not isinstance(image, Tensor): + raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") + if image.shape != shape: + raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") + + for target in targets: + boxes = target["boxes"] + if not isinstance(boxes, Tensor): + raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}.") + if (boxes.ndim != 2) or (boxes.shape[-1] != 4): + raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") + labels = target["labels"] + if not isinstance(labels, Tensor): + raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels).__name__}.") + if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): + raise ValueError( + f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." + ) + + return torch.stack(images), targets + + @under_review() class YOLO(LightningModule): """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, @@ -174,7 +215,9 @@ def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[optim.lr_sch default_group = [] wd_group = [] for name, tensor in self.named_parameters(): - if name.endswith(".conv.weight"): + if not tensor.requires_grad: + continue + elif name.endswith(".conv.weight"): wd_group.append(tensor) else: default_group.append(tensor) @@ -200,7 +243,7 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Returns: A dictionary that includes the training loss in 'loss'. """ - images, targets = self._validate_batch(batch) + images, targets = validate_batch(batch) _, losses = self(images, targets) # sync_dist=True is broken in some versions of Lightning and may cause the sum of the loss @@ -222,7 +265,7 @@ def validation_step( # type: ignore dictionaries. batch_idx: Index of the current batch. """ - images, targets = self._validate_batch(batch) + images, targets = validate_batch(batch) detections, losses = self(images, targets) self.log("val/overlap_loss", losses[0], sync_dist=True) @@ -250,7 +293,7 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti dictionaries. batch_idx: Index of the current batch. """ - images, targets = self._validate_batch(batch) + images, targets = validate_batch(batch) detections, losses = self(images, targets) self.log("test/overlap_loss", losses[0], sync_dist=True) @@ -287,7 +330,7 @@ class labels. bounding box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box detections. "labels" is a vector of predicted class labels. """ - images, _ = self._validate_batch(batch) + images, _ = validate_batch(batch) detections = self(images) detections = self.process_detections(detections) @@ -369,41 +412,6 @@ def process(boxes: Tensor, labels: Tensor, **other: Any) -> Dict[str, Any]: return [process(**t) for t in targets] - def _validate_batch(self, batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: - """Reads a batch of data, validates the format, and stacks the images into a single tensor. 
- - Args: - batch: The batch of data read by the :class:`~torch.utils.data.DataLoader`. - - Returns: - The input batch with images stacked into a single tensor. - """ - images, targets = batch - - if len(images) != len(targets): - raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") - - for image in images: - if not isinstance(image, Tensor): - raise ValueError(f"Expected image to be of type Tensor, got {type(image)}.") - - for target in targets: - boxes = target["boxes"] - if not isinstance(boxes, Tensor): - raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes)}.") - if (boxes.ndim != 2) or (boxes.shape[-1] != 4): - raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") - labels = target["labels"] - if not isinstance(labels, Tensor): - raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels)}.") - if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): - raise ValueError( - f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." - ) - - images = torch.stack(images) - return images, targets - class DarknetYOLO(YOLO): """A subclass of YOLO that uses a Darknet configuration file and can be configured using LightningCLI. From a42cdec9120a1f5b874f771196cc4369c65b07a8 Mon Sep 17 00:00:00 2001 From: otaj Date: Wed, 21 Sep 2022 17:20:39 +0200 Subject: [PATCH 34/76] resolve accidentally introduced errors --- tests/models/test_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 1c3e2f0b14..899ec5c393 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -112,8 +112,8 @@ def test_yolo_train(tmpdir, cfg_name, catch_warnings): train_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False) - trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl, max_expochs=10) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) @pytest.mark.parametrize( From 1d324e5bc0ec031e0beeb2b93662b44c7faecca0 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Oct 2022 13:04:16 +0300 Subject: [PATCH 35/76] infer() returns the model to the previous mode --- .../models/detection/yolo/darknet_network.py | 2 +- pl_bolts/models/detection/yolo/layers.py | 4 +-- pl_bolts/models/detection/yolo/yolo_module.py | 34 ++++++++++++++----- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 52765a7489..08eaa93e25 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -99,7 +99,7 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU for layer in self.layers: if isinstance(layer, (layers.RouteLayer, layers.ShortcutLayer)): - x = layer(x, outputs) + x = layer(outputs) elif isinstance(layer, layers.DetectionLayer): x = layer(x, image_size, targets) detections.append(x) diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index cfd0590140..2170b3098a 100644 --- 
a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -289,7 +289,7 @@ def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> self.num_chunks = num_chunks self.chunk_idx = chunk_idx - def forward(self, x: Tensor, outputs: List[Tensor]) -> Tensor: + def forward(self, outputs: List[Tensor]) -> Tensor: chunks = [torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] for layer in self.source_layers] return torch.cat(chunks, dim=1) @@ -305,7 +305,7 @@ def __init__(self, source_layer: int) -> None: super().__init__() self.source_layer = source_layer - def forward(self, x: Tensor, outputs: List[Tensor]) -> Tensor: + def forward(self, outputs: List[Tensor]) -> Tensor: return outputs[-1] + outputs[self.source_layer] diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 85a51b917e..fd8d90da58 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -141,9 +141,9 @@ def __init__( self, network: nn.Module, optimizer: Type[optim.Optimizer] = optim.SGD, - optimizer_params: Dict[str, Any] = {"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0005}, + optimizer_params: Optional[Dict[str, Any]] = None, lr_scheduler: Type[optim.lr_scheduler._LRScheduler] = LinearWarmupCosineAnnealingLR, - lr_scheduler_params: Dict[str, Any] = {"warmup_epochs": 5, "max_epochs": 300, "warmup_start_lr": 0.0}, + lr_scheduler_params: Optional[Dict[str, Any]] = None, confidence_threshold: float = 0.2, nms_threshold: float = 0.45, detections_per_image: int = 300, @@ -155,9 +155,15 @@ def __init__( self.network = network self.optimizer_class = optimizer - self.optimizer_params = optimizer_params + if optimizer_params is not None: + self.optimizer_params = optimizer_params + else: + self.optimizer_params = {"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0005} self.lr_scheduler_class = lr_scheduler - self.lr_scheduler_params = lr_scheduler_params + if lr_scheduler_params is not None: + self.lr_scheduler_params = lr_scheduler_params + else: + self.lr_scheduler_params = {"warmup_epochs": 5, "max_epochs": 300, "warmup_start_lr": 0.0} self.confidence_threshold = confidence_threshold self.nms_threshold = nms_threshold self.detections_per_image = detections_per_image @@ -351,10 +357,16 @@ def infer(self, image: Tensor) -> Dict[str, Tensor]: if not isinstance(image, torch.Tensor): image = F.to_tensor(image) + was_training = self.training self.eval() + detections = self(image.unsqueeze(0)) detections = self.process_detections(detections) - return detections[0] + detections = detections[0] + + if was_training: + self.train() + return detections def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and @@ -418,10 +430,16 @@ class DarknetYOLO(YOLO): CLI command:: - # PascalVOC using LightningCLI wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg - python yolo_module.py fit --model.network_config yolov4-tiny-3l.cfg --data.batch_size 8 --trainer.gpus 8 \ - --trainer.accumulate_grad_batches 2 + python yolo_module.py fit \ + --model.network_config yolov4-tiny-3l.cfg \ + --data.batch_size 8 \ + --data.num_workers 4 \ + --trainer.accelerator gpu \ + --trainer.devices 8 \ + --trainer.accumulate_grad_batches 2 \ + --trainer.gradient_clip_val 5.0 \ + --trainer.max_epochs=100 Args: network_config: Path to a Darknet 
configuration file that defines the network architecture. From 374a3ecedf2221eb3864018595a0772022c53574 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Oct 2022 16:42:15 +0300 Subject: [PATCH 36/76] CLI YOLO application uses the YOLOv4 architecture, if a Darknet configuration files is not provided --- pl_bolts/models/detection/yolo/loss.py | 7 +- pl_bolts/models/detection/yolo/yolo_module.py | 82 ++++++++++++++----- 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index e00c73aea1..0a09393b14 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -182,7 +182,7 @@ def __init__( self, overlap_func: Union[str, Callable] = "ciou", predict_overlap: Optional[float] = None, - overlap_multiplier: float = 1.0, + overlap_multiplier: float = 5.0, confidence_multiplier: float = 1.0, class_multiplier: float = 1.0, ): @@ -200,7 +200,10 @@ def __init__( overlap_loss_func = complete_box_iou_loss if not callable(overlap_func): - raise ValueError("Unsupported overlap function. Try upgrading Torcvision or using another IoU algorithm.") + raise ValueError( + f"Unsupported overlap function '{overlap_func}'. Try upgrading Torcvision or using another IoU " + "algorithm." + ) self._pairwise_overlap = overlap_func if callable(overlap_loss_func): diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index fd8d90da58..522b364b11 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -4,13 +4,14 @@ import torch import torch.nn as nn from pytorch_lightning import LightningModule -from pytorch_lightning.utilities.cli import LightningCLI +from pytorch_lightning.cli import LightningCLI from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT from torch import Tensor, optim from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork +from pl_bolts.models.detection.yolo.torch_networks import YOLOV4Network from pl_bolts.models.detection.yolo.types import TARGET, TARGETS from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHMETRICS_DETECTION_AVAILABLE, _TORCHVISION_AVAILABLE @@ -425,11 +426,15 @@ def process(boxes: Tensor, labels: Tensor, **other: Any) -> Dict[str, Any]: return [process(**t) for t in targets] -class DarknetYOLO(YOLO): - """A subclass of YOLO that uses a Darknet configuration file and can be configured using LightningCLI. +class CLIYOLO(YOLO): + """A subclass of YOLO that can be easily configured using LightningCLI. + + Either loads a Darknet configuration file, or constructs a YOLOv4 network. This is just an example of how to use the + model. Various other network architectures from ``torch_networks.py`` can be used. 
CLI command:: + # Darknet network configuration wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg python yolo_module.py fit \ --model.network_config yolov4-tiny-3l.cfg \ @@ -441,8 +446,19 @@ class DarknetYOLO(YOLO): --trainer.gradient_clip_val 5.0 \ --trainer.max_epochs=100 + # YOLOv4 + python yolo_module.py fit \ + --data.batch_size 8 \ + --data.num_workers 4 \ + --trainer.accelerator gpu \ + --trainer.devices 8 \ + --trainer.accumulate_grad_batches 2 \ + --trainer.gradient_clip_val 5.0 \ + --trainer.max_epochs=100 + Args: - network_config: Path to a Darknet configuration file that defines the network architecture. + network_config: Path to a Darknet configuration file that defines the network architecture. If not given, a + YOLOv4 network will be constructed. matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that @@ -451,9 +467,8 @@ class DarknetYOLO(YOLO): ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. - overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a - function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and - "ciou". + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Valid values are + "iou", "giou", "diou", and "ciou". predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. @@ -464,7 +479,7 @@ class DarknetYOLO(YOLO): def __init__( self, - network_config: str, + network_config: Optional[str] = None, darknet_weights: Optional[str] = None, matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, @@ -476,18 +491,43 @@ def __init__( class_loss_multiplier: Optional[float] = None, **kwargs: Any, ) -> None: - network = DarknetNetwork( - network_config, - darknet_weights, - matching_algorithm=matching_algorithm, - matching_threshold=matching_threshold, - ignore_bg_threshold=ignore_bg_threshold, - overlap_func=overlap_func, - predict_overlap=predict_overlap, - overlap_loss_multiplier=overlap_loss_multiplier, - confidence_loss_multiplier=confidence_loss_multiplier, - class_loss_multiplier=class_loss_multiplier, - ) + if network_config is not None: + network = DarknetNetwork( + network_config, + darknet_weights, + matching_algorithm=matching_algorithm, + matching_threshold=matching_threshold, + ignore_bg_threshold=ignore_bg_threshold, + overlap_func=overlap_func, + predict_overlap=predict_overlap, + overlap_loss_multiplier=overlap_loss_multiplier, + confidence_loss_multiplier=confidence_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, + ) + else: + # We need to set some defaults, since we don't get the default values from a configuration file. 
+ if ignore_bg_threshold is None: + ignore_bg_threshold = 0.7 + if overlap_func is None: + overlap_func = "ciou" + if overlap_loss_multiplier is None: + overlap_loss_multiplier = 5.0 + if confidence_loss_multiplier is None: + confidence_loss_multiplier = 1.0 + if class_loss_multiplier is None: + class_loss_multiplier = 1.0 + + network = YOLOV4Network( + num_classes=21, # The number of classes in Pascal VOC dataset. + matching_algorithm=matching_algorithm, + matching_threshold=matching_threshold, + ignore_bg_threshold=ignore_bg_threshold, + overlap_func=overlap_func, + predict_overlap=predict_overlap, + overlap_loss_multiplier=overlap_loss_multiplier, + confidence_loss_multiplier=confidence_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, + ) super().__init__(**kwargs, network=network) @@ -539,4 +579,4 @@ def _resize(self, image: Tensor, target: TARGET) -> Tuple[Tensor, TARGET]: if __name__ == "__main__": - LightningCLI(DarknetYOLO, ResizedVOCDetectionDataModule, seed_everything_default=42) + LightningCLI(CLIYOLO, ResizedVOCDetectionDataModule, seed_everything_default=42) From 737ec64cb2910bb4c058595a21b39918d1d3d400 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Oct 2022 17:20:50 +0300 Subject: [PATCH 37/76] Minor documentation improvements --- pl_bolts/models/detection/yolo/yolo_module.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 522b364b11..2fc6a73d45 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -94,7 +94,9 @@ class YOLO(LightningModule): The network architecture can be written in PyTorch, or read from a Darknet configuration file using the :class:`~pl_bolts.models.detection.yolo.darknet_network.DarknetNetwork` class. ``DarknetNetwork`` is also able to - read weights from a Darknet model file. + read weights from a Darknet model file. See the CLI application and the + :class:`~pl_bolts.models.detection.yolo.yolo_module.CLIYOLO` class for an example of how to specify a network + architecture. The input from the data loader is expected to be a list of images. Each image is a tensor with shape ``[channels, height, width]``. The images from a single batch will be stacked into a single tensor, so the sizes @@ -430,7 +432,9 @@ class CLIYOLO(YOLO): """A subclass of YOLO that can be easily configured using LightningCLI. Either loads a Darknet configuration file, or constructs a YOLOv4 network. This is just an example of how to use the - model. Various other network architectures from ``torch_networks.py`` can be used. + model. Various other network architectures from ``torch_networks.py`` can be used. Note that if you change the + resolution of the input images, you should also scale the prior shapes (a.k.a. anchors). They are specified in the + Darknet configuration file or provided in the network constructor parameters. 
CLI command:: From d534cfaa942cd8f296191cb184c41576aa39a3f6 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 11 Oct 2022 11:14:03 +0200 Subject: [PATCH 38/76] add catch_warnings --- tests/models/test_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 899ec5c393..b451ddfde7 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -82,7 +82,7 @@ def test_fasterrcnn_pyt_module_bbone_train(tmpdir): @pytest.mark.parametrize("config", [("yolo"), ("yolo_giou")]) -def test_yolo(config): +def test_yolo(config, catch_warnings): config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" config = YOLOConfiguration(config_path) model = YOLO(config.get_network()) From 7bd8d340d7cdfa5445d72ce35c342fbfaa51fb31 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 12 Oct 2022 09:44:12 +0300 Subject: [PATCH 39/76] Fixed a typo Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> --- pl_bolts/models/detection/yolo/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 0a09393b14..37227085d7 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -201,7 +201,7 @@ def __init__( if not callable(overlap_func): raise ValueError( - f"Unsupported overlap function '{overlap_func}'. Try upgrading Torcvision or using another IoU " + f"Unsupported overlap function '{overlap_func}'. Try upgrading Torchvision or using another IoU " "algorithm." ) self._pairwise_overlap = overlap_func From ddc4a46d04fa6c5079724758e751961fdf7972ce Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 12 Oct 2022 12:43:38 +0300 Subject: [PATCH 40/76] Fixed unit tests and added catch_warnings to all tests. 
--- tests/models/test_detection.py | 186 +++++------------- ...yolo_config.py => test_darknet_network.py} | 30 +-- .../models/yolo/unit/test_target_matching.py | 21 ++ tests/models/yolo/unit/test_utils.py | 115 +++++++++++ tests/models/yolo/unit/test_yolo_layers.py | 51 ----- 5 files changed, 199 insertions(+), 204 deletions(-) rename tests/models/yolo/unit/{test_yolo_config.py => test_darknet_network.py} (76%) create mode 100644 tests/models/yolo/unit/test_target_matching.py create mode 100644 tests/models/yolo/unit/test_utils.py delete mode 100644 tests/models/yolo/unit/test_yolo_layers.py diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index b1cc364e40..574bd82f83 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -20,15 +20,6 @@ YOLOXNetwork, ) from pl_bolts.models.detection.faster_rcnn import create_fasterrcnn_backbone -from pl_bolts.models.detection.yolo.target_matching import _sim_ota_match -from pl_bolts.models.detection.yolo.utils import ( - aligned_iou, - global_xy, - grid_centers, - grid_offsets, - iou_below, - is_inside_box, -) from tests import TEST_ROOT @@ -99,125 +90,6 @@ def test_fasterrcnn_pyt_module_bbone_train(tmpdir): trainer.fit(model, train_dl, valid_dl) -@pytest.mark.parametrize("width,height", [(10, 5)]) -def test_grid_offsets(width: int, height: int): - size = torch.tensor([width, height]) - offsets = grid_offsets(size) - assert offsets.shape == (height, width, 2) - assert torch.equal(offsets[0, :, 0], torch.arange(width, dtype=offsets.dtype)) - assert torch.equal(offsets[0, :, 1], torch.zeros(width, dtype=offsets.dtype)) - assert torch.equal(offsets[:, 0, 0], torch.zeros(height, dtype=offsets.dtype)) - assert torch.equal(offsets[:, 0, 1], torch.arange(height, dtype=offsets.dtype)) - - -@pytest.mark.parametrize("width,height", [(10, 5)]) -def test_grid_centers(width: int, height: int): - size = torch.tensor([width, height]) - centers = grid_centers(size) - assert centers.shape == (height, width, 2) - assert torch.equal(centers[0, :, 0], 0.5 + torch.arange(width, dtype=torch.float)) - assert torch.equal(centers[0, :, 1], 0.5 * torch.ones(width)) - assert torch.equal(centers[:, 0, 0], 0.5 * torch.ones(height)) - assert torch.equal(centers[:, 0, 1], 0.5 + torch.arange(height, dtype=torch.float)) - - -def test_global_xy(): - xy = torch.ones((2, 4, 4, 3, 2)) * 0.5 # 4x4 grid of coordinates to the center of the cell. 
- image_size = torch.tensor([400, 200]) - xy = global_xy(xy, image_size) - assert xy.shape == (2, 4, 4, 3, 2) - assert torch.all(xy[:, :, 0, :, 0] == 50) - assert torch.all(xy[:, 0, :, :, 1] == 25) - assert torch.all(xy[:, :, 1, :, 0] == 150) - assert torch.all(xy[:, 1, :, :, 1] == 75) - assert torch.all(xy[:, :, 2, :, 0] == 250) - assert torch.all(xy[:, 2, :, :, 1] == 125) - assert torch.all(xy[:, :, 3, :, 0] == 350) - assert torch.all(xy[:, 3, :, :, 1] == 175) - - -def test_is_inside_box(): - """ - centers: - [[1,1; 3,1; 5,1; 7,1; 9,1; 11,1; 13,1; 15,1; 17,1; 19,1] - [1,3; 3,3; 5,3; 7,3; 9,3; 11,3; 13,3; 15,3; 17,3; 19,3] - [1,5; 3,5; 5,5; 7,5; 9,5; 11,5; 13,5; 15,5; 17,5; 19,5] - [1,7; 3,7; 5,7; 7,7; 9,7; 11,7; 13,7; 15,7; 17,7; 19,7] - [1,9; 3,9; 5,9; 7,9; 9,9; 11,9; 13,9; 15,9; 17,9; 19,9]] - - is_inside[0]: - [[F, F, F, F, F, F, F, F, F, F] - [F, T, T, F, F, F, F, F, F, F] - [F, T, T, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, F, F, F]] - - is_inside[1]: - [[F, F, F, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, F, F, F] - [F, F, F, F, F, F, F, T, T, F]] - """ - size = torch.tensor([10, 5]) - centers = grid_centers(size) * 2.0 - centers = centers.view(-1, 2) - boxes = torch.tensor([[2, 2, 6, 6], [14, 8, 18, 10]]) - is_inside = is_inside_box(centers, boxes).view(2, 5, 10) - assert torch.count_nonzero(is_inside) == 6 - assert torch.all(is_inside[0, 1:3, 1:3]) - assert torch.all(is_inside[1, 4, 7:9]) - - -def test_sim_ota_match(): - # IoUs will determined that 2 and 1 predictions will be selected for the first and the second target. - ious = torch.tensor([[0.1, 0.1, 0.9, 0.9], [0.2, 0.3, 0.4, 0.1]]) - # Costs will determine that the first and the last prediction will be selected for the first target, and the first - # prediction will be selected for the second target. Since the first prediction was selected for both targets, it - # will be matched to the best target only (the second one). - costs = torch.tensor([[0.3, 0.5, 0.4, 0.3], [0.1, 0.2, 0.5, 0.3]]) - matched_preds, matched_targets = _sim_ota_match(costs, ious) - assert len(matched_preds) == 4 - assert matched_preds[0] - assert not matched_preds[1] - assert not matched_preds[2] - assert matched_preds[3] - assert len(matched_targets) == 2 # Two predictions were matched. - assert matched_targets[0] == 1 # Which target was matched to the first prediction. - assert matched_targets[1] == 0 # Which target was matched to the last prediction. 
- - -@pytest.mark.parametrize( - "dims1, dims2, expected_ious", - [ - ( - torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), - torch.tensor([[1.0, 10.0], [2.0, 20.0]]), - torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), - ) - ], -) -def test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - torch.testing.assert_close(aligned_iou(dims1, dims2), expected_ious) - - -def test_iou_below(): - tl = torch.rand((10, 10, 3, 2)) * 100 - br = tl + 10 - pred_boxes = torch.cat((tl, br), -1) - target_boxes = torch.stack((pred_boxes[1, 1, 0], pred_boxes[3, 5, 1])) - result = iou_below(pred_boxes, target_boxes, 0.9) - assert result.shape == (10, 10, 3) - assert not result[1, 1, 0] - assert not result[3, 5, 1] - - @pytest.mark.parametrize("config", [("yolo"), ("yolo_giou")]) def test_darknet(config, catch_warnings): config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" @@ -253,7 +125,7 @@ def test_darknet_train(tmpdir, cfg_name, catch_warnings): trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolov4_tiny(tmpdir): +def test_yolov4_tiny(catch_warnings): network = YOLOV4TinyNetwork(num_classes=2, width=4) model = YOLO(network) @@ -262,17 +134,23 @@ def test_yolov4_tiny(tmpdir): def test_yolov4_tiny_train(tmpdir): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + network = YOLOV4TinyNetwork(num_classes=2, width=4) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolov4(tmpdir): +def test_yolov4(catch_warnings): network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) model = YOLO(network) @@ -280,18 +158,24 @@ def test_yolov4(tmpdir): model(image) -def test_yolov4_train(tmpdir): +def test_yolov4_train(tmpdir, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolov4p6(tmpdir): +def test_yolov4p6(catch_warnings): network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) model = YOLO(network) @@ -299,18 +183,24 @@ def test_yolov4p6(tmpdir): model(image) -def test_yolov4p6_train(tmpdir): +def test_yolov4p6_train(tmpdir, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + network = YOLOV4P6Network(num_classes=2, 
widths=(4, 8, 16, 32, 64, 128, 128)) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolov5(tmpdir): +def test_yolov5(catch_warnings): network = YOLOV5Network(num_classes=2, depth=1, width=4) model = YOLO(network) @@ -318,18 +208,24 @@ def test_yolov5(tmpdir): model(image) -def test_yolov5_train(tmpdir): +def test_yolov5_train(tmpdir, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + network = YOLOV5Network(num_classes=2, depth=1, width=4) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) -def test_yolox(tmpdir): +def test_yolox(catch_warnings): network = YOLOXNetwork(num_classes=2, depth=1, width=4) model = YOLO(network) @@ -337,12 +233,18 @@ def test_yolox(tmpdir): model(image) -def test_yolox_train(tmpdir): +def test_yolox_train(tmpdir, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + network = YOLOXNetwork(num_classes=2, depth=1, width=4) model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) - trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir) + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_darknet_network.py similarity index 76% rename from tests/models/yolo/unit/test_yolo_config.py rename to tests/models/yolo/unit/test_darknet_network.py index 807e2a84db..0fd45a1a42 100644 --- a/tests/models/yolo/unit/test_yolo_config.py +++ b/tests/models/yolo/unit/test_darknet_network.py @@ -1,9 +1,10 @@ import warnings import pytest +import torch.nn as nn from pytorch_lightning.utilities.warnings import PossibleUserWarning -from pl_bolts.models.detection.yolo.yolo_config import ( +from pl_bolts.models.detection.yolo.darknet_network import ( _create_convolutional, _create_maxpool, _create_shortcut, @@ -40,13 +41,14 @@ def test_create_convolutional(config, catch_warnings): assert conv.conv.padding == (pad_size, pad_size) if config["batch_normalize"]: - assert len(conv) == 3 + assert isinstance(conv.norm, nn.BatchNorm2d) - if activation != "linear": - if activation != "logistic": - assert activation == conv[-1].__class__.__name__.lower()[: len(activation)] - elif activation == "logistic": - assert "sigmoid" == conv[-1].__class__.__name__.lower() + if activation == "linear": + assert isinstance(conv.act, nn.Identity) + elif activation 
== "logistic": + assert isinstance(conv.act, nn.Sigmoid) + else: + assert conv.act.__class__.__name__.lower().startswith(activation) @pytest.mark.parametrize( @@ -73,12 +75,18 @@ def test_create_maxpool(config, catch_warnings): category=PossibleUserWarning, ) - pad_size = (config["size"] - 1) // 2 + # print("size", config["size"]) + # print("stride", ) + pad_size, remainder = divmod(max(config["size"], config["stride"]) - config["stride"], 2) + # print("pad_size", pad_size) maxpool, _ = _create_maxpool(config, [3]) + # print("maxpool.maxpool.padding", maxpool.maxpool.padding) - assert maxpool.kernel_size == config["size"] - assert maxpool.stride == config["stride"] - assert maxpool.padding == pad_size + assert maxpool.maxpool.kernel_size == config["size"] + assert maxpool.maxpool.stride == config["stride"] + assert maxpool.maxpool.padding == pad_size + if remainder != 0: + assert isinstance(maxpool.pad, nn.ZeroPad2d) @pytest.mark.parametrize( diff --git a/tests/models/yolo/unit/test_target_matching.py b/tests/models/yolo/unit/test_target_matching.py new file mode 100644 index 0000000000..07f00ad751 --- /dev/null +++ b/tests/models/yolo/unit/test_target_matching.py @@ -0,0 +1,21 @@ +import torch + +from pl_bolts.models.detection.yolo.target_matching import _sim_ota_match + + +def test_sim_ota_match(catch_warnings): + # IoUs will determined that 2 and 1 predictions will be selected for the first and the second target. + ious = torch.tensor([[0.1, 0.1, 0.9, 0.9], [0.2, 0.3, 0.4, 0.1]]) + # Costs will determine that the first and the last prediction will be selected for the first target, and the first + # prediction will be selected for the second target. Since the first prediction was selected for both targets, it + # will be matched to the best target only (the second one). + costs = torch.tensor([[0.3, 0.5, 0.4, 0.3], [0.1, 0.2, 0.5, 0.3]]) + matched_preds, matched_targets = _sim_ota_match(costs, ious) + assert len(matched_preds) == 4 + assert matched_preds[0] + assert not matched_preds[1] + assert not matched_preds[2] + assert matched_preds[3] + assert len(matched_targets) == 2 # Two predictions were matched. + assert matched_targets[0] == 1 # Which target was matched to the first prediction. + assert matched_targets[1] == 0 # Which target was matched to the last prediction. 
diff --git a/tests/models/yolo/unit/test_utils.py b/tests/models/yolo/unit/test_utils.py new file mode 100644 index 0000000000..fa4121a191 --- /dev/null +++ b/tests/models/yolo/unit/test_utils.py @@ -0,0 +1,115 @@ +import warnings + +import pytest +import torch +from pytorch_lightning.utilities.warnings import PossibleUserWarning + +from pl_bolts.models.detection.yolo.utils import ( + aligned_iou, + global_xy, + grid_centers, + grid_offsets, + iou_below, + is_inside_box, +) + + +@pytest.mark.parametrize("width,height", [(10, 5)]) +def test_grid_offsets(width: int, height: int, catch_warnings): + size = torch.tensor([width, height]) + offsets = grid_offsets(size) + assert offsets.shape == (height, width, 2) + assert torch.equal(offsets[0, :, 0], torch.arange(width, dtype=offsets.dtype)) + assert torch.equal(offsets[0, :, 1], torch.zeros(width, dtype=offsets.dtype)) + assert torch.equal(offsets[:, 0, 0], torch.zeros(height, dtype=offsets.dtype)) + assert torch.equal(offsets[:, 0, 1], torch.arange(height, dtype=offsets.dtype)) + + +@pytest.mark.parametrize("width,height", [(10, 5)]) +def test_grid_centers(width: int, height: int, catch_warnings): + size = torch.tensor([width, height]) + centers = grid_centers(size) + assert centers.shape == (height, width, 2) + assert torch.equal(centers[0, :, 0], 0.5 + torch.arange(width, dtype=torch.float)) + assert torch.equal(centers[0, :, 1], 0.5 * torch.ones(width)) + assert torch.equal(centers[:, 0, 0], 0.5 * torch.ones(height)) + assert torch.equal(centers[:, 0, 1], 0.5 + torch.arange(height, dtype=torch.float)) + + +def test_global_xy(catch_warnings): + xy = torch.ones((2, 4, 4, 3, 2)) * 0.5 # 4x4 grid of coordinates to the center of the cell. + image_size = torch.tensor([400, 200]) + xy = global_xy(xy, image_size) + assert xy.shape == (2, 4, 4, 3, 2) + assert torch.all(xy[:, :, 0, :, 0] == 50) + assert torch.all(xy[:, 0, :, :, 1] == 25) + assert torch.all(xy[:, :, 1, :, 0] == 150) + assert torch.all(xy[:, 1, :, :, 1] == 75) + assert torch.all(xy[:, :, 2, :, 0] == 250) + assert torch.all(xy[:, 2, :, :, 1] == 125) + assert torch.all(xy[:, :, 3, :, 0] == 350) + assert torch.all(xy[:, 3, :, :, 1] == 175) + + +def test_is_inside_box(catch_warnings): + """ + centers: + [[1,1; 3,1; 5,1; 7,1; 9,1; 11,1; 13,1; 15,1; 17,1; 19,1] + [1,3; 3,3; 5,3; 7,3; 9,3; 11,3; 13,3; 15,3; 17,3; 19,3] + [1,5; 3,5; 5,5; 7,5; 9,5; 11,5; 13,5; 15,5; 17,5; 19,5] + [1,7; 3,7; 5,7; 7,7; 9,7; 11,7; 13,7; 15,7; 17,7; 19,7] + [1,9; 3,9; 5,9; 7,9; 9,9; 11,9; 13,9; 15,9; 17,9; 19,9]] + + is_inside[0]: + [[F, F, F, F, F, F, F, F, F, F] + [F, T, T, F, F, F, F, F, F, F] + [F, T, T, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F]] + + is_inside[1]: + [[F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, F, F, F] + [F, F, F, F, F, F, F, T, T, F]] + """ + size = torch.tensor([10, 5]) + centers = grid_centers(size) * 2.0 + centers = centers.view(-1, 2) + boxes = torch.tensor([[2, 2, 6, 6], [14, 8, 18, 10]]) + is_inside = is_inside_box(centers, boxes).view(2, 5, 10) + assert torch.count_nonzero(is_inside) == 6 + assert torch.all(is_inside[0, 1:3, 1:3]) + assert torch.all(is_inside[1, 4, 7:9]) + + +@pytest.mark.parametrize( + "dims1, dims2, expected_ious", + [ + ( + torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), + torch.tensor([[1.0, 10.0], [2.0, 20.0]]), + torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), + ) + ], +) +def 
test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + + torch.testing.assert_close(aligned_iou(dims1, dims2), expected_ious) + + +def test_iou_below(catch_warnings): + tl = torch.rand((10, 10, 3, 2)) * 100 + br = tl + 10 + pred_boxes = torch.cat((tl, br), -1) + target_boxes = torch.stack((pred_boxes[1, 1, 0], pred_boxes[3, 5, 1])) + result = iou_below(pred_boxes, target_boxes, 0.9) + assert result.shape == (10, 10, 3) + assert not result[1, 1, 0] + assert not result[3, 5, 1] diff --git a/tests/models/yolo/unit/test_yolo_layers.py b/tests/models/yolo/unit/test_yolo_layers.py deleted file mode 100644 index 02e209d400..0000000000 --- a/tests/models/yolo/unit/test_yolo_layers.py +++ /dev/null @@ -1,51 +0,0 @@ -import warnings - -import pytest -import torch -from pytorch_lightning.utilities.warnings import PossibleUserWarning - -from pl_bolts.models.detection.yolo.yolo_layers import GIoULoss, IoULoss, SELoss, _corner_coordinates - - -@pytest.mark.parametrize( - "xy, wh, expected", - [ - ([0.0, 0.0], [1.0, 1.0], [-0.5, -0.5, 0.5, 0.5]), - ([5.0, 5.0], [2.0, 2.0], [4.0, 4.0, 6.0, 6.0]), - ], -) -def test_corner_coordinates(xy, wh, expected, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - xy = torch.tensor(xy) - wh = torch.tensor(wh) - corners = _corner_coordinates(xy, wh) - assert torch.allclose(corners, torch.tensor(expected)) - - -@pytest.mark.parametrize( - "loss_func, bbox1, bbox2, expected", - [ - (GIoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.4144532680511475), - (IoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.0), - (SELoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 59479.0), - ], -) -def test_loss_functions(loss_func, bbox1, bbox2, expected, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - loss_func = loss_func() - tensor1 = torch.tensor(bbox1, dtype=torch.float32) - tensor2 = torch.tensor(bbox2, dtype=torch.float32) - - loss = loss_func(tensor1, tensor2) - assert loss.item() > 0.0 - assert loss.item() == expected From 37a3f7c338c2a390ce4db5127100872922046790 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 14 Oct 2022 20:58:21 +0300 Subject: [PATCH 41/76] Added a README and documentation for YOLO --- pl_bolts/models/detection/yolo/README.md | 63 +++++++++++++++++++ .../models/detection/yolo/target_matching.py | 14 ++--- .../models/detection/yolo/torch_networks.py | 47 ++++++++++---- pl_bolts/models/detection/yolo/yolo_module.py | 16 ++--- 4 files changed, 112 insertions(+), 28 deletions(-) create mode 100644 pl_bolts/models/detection/yolo/README.md diff --git a/pl_bolts/models/detection/yolo/README.md b/pl_bolts/models/detection/yolo/README.md new file mode 100644 index 0000000000..eb5bb938a8 --- /dev/null +++ b/pl_bolts/models/detection/yolo/README.md @@ -0,0 +1,63 @@ +# YOLO + +The YOLO model has evolved quite a bit, since the original publication in 2016. The original source code was written in C, using a framework called [Darknet](https://github.com/pjreddie/darknet). The final revision by the original author was called YOLOv3 and described in an [arXiv paper](https://arxiv.org/abs/1804.02767). 
Later various other authors have written implementations that improve various different aspects of the model or the training procedure. [YOLOv4 implementation](https://github.com/AlexeyAB/darknet) was still based on Darknet and [YOLOv5](https://github.com/ultralytics/yolov5) was written using PyTorch. Most other implementations are based on these. + +This PyTorch Lightning implementation combines features from some of the notable YOLO implementations. The most important papers are: + +- *YOLOv3*: [https://arxiv.org/abs/1804.02767](Joseph Redmon and Ali Farhadi) +- *YOLOv4*: [https://arxiv.org/abs/2004.10934>](Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao) +- *Scaled-YOLOv4*: [https://arxiv.org/abs/2011.08036](Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao) +- *YOLOX*: [https://arxiv.org/abs/2107.08430](Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun) + + +## Network Architecture + +Any network can be used with YOLO detection heads as long as it produces feature maps with the correct number of features. Typically the network consists of a CNN backbone combined with a [Feature Pyramid Network](https://arxiv.org/abs/1612.03144) or a [Path Aggregation Network](https://arxiv.org/abs/1803.01534). Backbone layers reduce the size of the feature map and the network may contain multiple detection heads that operate at different resolutions. + +The user can write the network architecture in PyTorch, or construct a computational graph based on a Darknet configuration file using the [`DarknetNetwork`](https://github.com/Lightning-AI/lightning-bolts/tree/master/pl_bolts/models/detection/yolo/darknet_network.py) class. The network object is passed to the YOLO constructor in the `network` argument. `DarknetNetwork` is also able to read weights from a Darknet model file. + +There are several network architectures included in the [`torch_networks`](https://github.com/Lightning-AI/lightning-bolts/tree/master/pl_bolts/models/detection/yolo/torch_networks.py) module (YOLOv4, YOLOv5, YOLOX). Larger and smaller variants of these models can be created by varying the `width` and `depth` arguments. + + +## Anchors + +A detection head can try to detect objects at each of the anchor points that are spaced evenly across the image in a grid. The size of the grid is determined by the width and height of the feature map. There can be a number of anchors (typically three) per grid cell. The number of features predicted per grid cell has to be `(5 + num_classes) * anchors_per_cell`. + +The width and the height of a bounding box is detected relative to a prior shape. `anchors_per_cell` prior shapes per detection head are defined in the network configuration. That is, if the network uses three detection heads, and each head detects three bounding boxes per grid cell, nine prior shapes need to be defined. They are defined in the Darknet configuration file or provided to the network class constructor. The defaults values have been obtained by clustering bounding box shapes in the COCO dataset. Note that if you use a different image size, you probably want to scale the prior shapes too. + +With the exception of the SimOTA matching algorithm, the prior shapes are also used for matching the ground-truth targets to anchors during training. In this case targets are matched only to anchors from the closest grid cell. The prior shapes are used to determine, to which anchors from that cell the target is matched. 
The losses are computed between the targets boxes and the predictions that correspond to their matched anchors. Different matching rules have been implemented: + +- *maxiou*: The original matching rule that matches a target to the prior shape that gives the highest IoU. +- *iou*: Matches a target to an anchor, if the IoU between the target and the prior shape is above a threshold. Multiple anchors may be matched to the same target, and the loss will be computed from a number of pairs that is generally not the same as number of ground-truth boxes. +- *size*: Calculates the ratio between the width and height of the target box to the prior width and height. If both the width and the height are close enough to the prior shape, matches the target to the anchor. +- *simota*: The SimOTA matching algorithm from YOLOX. Targets can be matched not only to anchors from the closest grid cell, but to any anchors that are inside the target bounding box. The matching algorithm is based on Optimal Transport and uses the training loss between the target and the predictions as the cost. That is, the prior shapes are not used for matching, but the predictions corresponding to the anchors. + + +## Input Data + +The model input is expected to be a list of images. Each image is a tensor with shape `[channels, height, width]`. The images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can have different image sizes. The feature pyramid network introduces another constraint on the image size: the width and the height have to be divisible by the ratio in which the network downsamples the input. + +During training, the model expects both the image tensors and a list of targets. Each target is a dictionary containing the following tensors: + +- *boxes*: `(x1, y1, x2, y2)` coordinates of the ground-truth boxes in a matrix with shape `[N, 4]`. +- *labels*: Either integer class labels in a vector of size `N` or a class mask for each ground-truth box in a boolean matrix with shape `[N, classes]` + + +## Training + +The command line application demonstrates how to train a YOLO model using PyTorch Lightning. The first step is to create a network, either from a Darknet configuration file, or using one of the included PyTorch networks. The network is passed to the YOLO model constructor. + +The data module needs to resize the data to a suitable size, in addition to any augmenting transforms. For example, YOLOv4 network requires that the width and the height are multiples of 32. + + +## Inference + +During inference, the model requires only the input images. `forward()` method receives a mini-batch of images in a tensor with shape `[N, channels, height, width]`. + +Every detection head predicts a bounding box at every anchor point. `forward()` returns the predictions from all detection heads in a tensor with shape `[N, anchors, classes + 5]`, where `anchors` is the total number of anchors in all detection heads. The predictions are `x1`, `y1`, `x2`, `y2`, width, height, confidence, and the probability for each class. The coordinates are scaled to the input image size. + +`infer()` method filters and processes the predictions. A class-specific score is obtained by multiplying the class probability with the detection confidence. Only detections with a high enough score are kept. YOLO does not use `softmax` to normalize the class probabilities, but each probability is normalized individually using `sigmoid`. Consequently, one object can be assigned to multiple categories. 
If more than one class has a score that is above the confidence threshold, these will be split into multiple detections. Then the detections are filtered using non-maximum suppression. The processed output is returned in a dictionary containing the following tensors: + +- *boxes*: a matrix of predicted bounding box `(x1, y1, x2, y2)` coordinates in image space +- *scores*: a vector of detection confidences +- *labels*: a vector of predicted class labels diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index b8dd6f20c5..3dcf552c19 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -67,21 +67,21 @@ def __call__( cell_i = grid_xy[:, 0].to(torch.int64).clamp(0, width - 1) cell_j = grid_xy[:, 1].to(torch.int64).clamp(0, height - 1) - matched_targets, matched_predictors = self.match(xywh[:, 2:]) + matched_targets, matched_anchors = self.match(xywh[:, 2:]) cell_i = cell_i[matched_targets] cell_j = cell_j[matched_targets] - # Background mask is used to select predictors that are not responsible for predicting any object, for + # Background mask is used to select anchors that are not responsible for predicting any object, for # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a # predicted box overlaps any target significantly, or if a prediction is matched to a target. background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) - background_mask[cell_j, cell_i, matched_predictors] = False + background_mask[cell_j, cell_i, matched_anchors] = False preds = { - "boxes": preds["boxes"][cell_j, cell_i, matched_predictors], - "confidences": preds["confidences"][cell_j, cell_i, matched_predictors], + "boxes": preds["boxes"][cell_j, cell_i, matched_anchors], + "confidences": preds["confidences"][cell_j, cell_i, matched_anchors], "bg_confidences": preds["confidences"][background_mask], - "classprobs": preds["classprobs"][cell_j, cell_i, matched_predictors], + "classprobs": preds["classprobs"][cell_j, cell_i, matched_anchors], } targets = { "boxes": targets["boxes"][matched_targets], @@ -123,7 +123,7 @@ def __init__( ) -> None: super().__init__(ignore_bg_threshold) self.prior_shapes = prior_shapes - # anchor_map maps the anchor indices to predictors in this layer, or to -1 if it's not an anchor of this layer. + # anchor_map maps the anchor indices to anchors in this layer, or to -1 if it's not an anchor of this layer. # This layer ignores the target if all the selected anchors are in another layer. self.anchor_map = [ prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 for idx in range(len(prior_shapes)) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 4204112f63..b82d7a1bbb 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -228,8 +228,10 @@ class YOLOV4Backbone(nn.Module): Args: in_channels: Number of channels in the input image. - widths: Number of channels at each network stage. - depths: Number of bottleneck layers at each network stage. + widths: Number of channels at each network stage. Typically ``(32, 64, 128, 256, 512, 1024)``. The P6 variant + adds one more stage with 1024 channels. + depths: Number of bottleneck layers at each network stage. Typically ``(1, 1, 2, 8, 8, 4)``. The P6 variant uses + ``(1, 1, 3, 15, 15, 7, 7)``. 
activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -291,8 +293,10 @@ class YOLOV5Backbone(nn.Module): Args: in_channels: Number of channels in the input image. width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number - of channels that is a multiple of this value. - depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + of channels that is a multiple of this value. The values used by the different variants are 16 (yolov5n), 32 + (yolov5s), 48 (yolov5m), 64 (yolov5l), and 80 (yolov5x). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolov5n, yolov5s), 2 (yolov5m), 3 (yolov5l), and 4 (yolov5x). activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -880,9 +884,11 @@ class YOLOV5Network(nn.Module): Args: num_classes: Number of different classes that this model predicts. backbone: A backbone network that returns the output from each stage. - width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a - number of channels that is a multiple of this value. - depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. The values used by the different variants are 16 (yolov5n), 32 + (yolov5s), 48 (yolov5m), 64 (yolov5l), and 80 (yolov5x). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolov5n, yolov5s), 2 (yolov5m), 3 (yolov5l), and 4 (yolov5x). activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -1050,6 +1056,19 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class YOLOXHead(nn.Module): + """A module that produces features for YOLO detection layer, decoupling the classification and localization + features. + + Args: + in_channels: Number of input channels that the module expects. + hidden_channels: Number of output channels in the hidden layers. + anchors_per_cell: Number of detections made at each spatial location of the feature map. + num_classes: Number of different classes that this model predicts. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + def __init__( self, in_channels: int, @@ -1057,12 +1076,12 @@ def __init__( anchors_per_cell: int, num_classes: int, activation: Optional[str] = "silu", - normalization: Optional[str] = "batchnorm", + norm: Optional[str] = "batchnorm", ) -> None: super().__init__() def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=norm) def linear(in_channels: int, out_channels: int) -> nn.Module: return nn.Conv2d(in_channels, out_channels, kernel_size=1) @@ -1100,9 +1119,11 @@ class YOLOXNetwork(nn.Module): Args: num_classes: Number of different classes that this model predicts. backbone: A backbone network that returns the output from each stage. - width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a - number of channels that is a multiple of this value. - depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. The values used by the different variants are 24 (yolox-tiny), + 32 (yolox-s), 48 (yolox-m), and 64 (yolox-l). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolox-tiny, yolox-s), 2 (yolox-m), and 3 (yolox-l). activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -1180,7 +1201,7 @@ def head(in_channels: int, hidden_channels: int) -> YOLOXHead: anchors_per_cell, num_classes, activation=activation, - normalization=normalization, + norm=normalization, ) def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 257f8105f1..2a68bf3115 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn from pytorch_lightning import LightningModule -from pytorch_lightning.cli import LightningCLI +from pytorch_lightning.utilities.cli import LightningCLI from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT from torch import Tensor, optim @@ -101,7 +101,7 @@ class YOLO(LightningModule): have to match. Different batches can have different image sizes, as long as the size is divisible by the ratio in which the network downsamples the input. - During training, the model expects both the input tensors and a list of targets. *Each target is a dictionary + During training, the model expects both the image tensors and a list of targets. *Each target is a dictionary containing the following tensors*: - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in `(x1, y1, x2, y2)` format @@ -109,9 +109,9 @@ class YOLO(LightningModule): ground-truth box :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.forward` method returns all predictions from all detection - layers in one tensor with shape ``[images, predictors, classes + 5]``. The coordinates are scaled to the input image - size. 
During training it also returns a dictionary containing the classification, box overlap, and confidence - losses. + layers in one tensor with shape ``[N, anchors, classes + 5]``, where ``anchors`` is the total number of anchors in + all detection layers. The coordinates are scaled to the input image size. During training it also returns a + dictionary containing the classification, box overlap, and confidence losses. During inference, the model requires only the image tensors. :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.infer` method filters and processes the predictions. If a @@ -191,9 +191,9 @@ def forward( # type: ignore Returns: detections (:class:`~torch.Tensor`), losses (Dict[str, :class:`~torch.Tensor`]): Detections, and if targets were provided, a dictionary of losses. Detections are shaped - ``[batch_size, predictors, classes + 5]``, where ``predictors`` is the total number of feature map cells in - all detection layers times the number of anchors per cell. The predicted box coordinates are in - `(x1, y1, x2, y2)` format and scaled to the input image size. + ``[batch_size, anchors, classes + 5]``, where ``anchors`` is the feature map size (width * height) times the + number of anchors per cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the + input image size. """ detections, losses, hits = self.network(images, targets) From de69cb26afbdce47fe4726189e5538267225c0a5 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sat, 15 Oct 2022 12:50:32 +0300 Subject: [PATCH 42/76] YOLO tests use giou loss, which is available in Torchvision 0.12 --- pl_bolts/models/detection/yolo/loss.py | 61 ++++++++++++++++---------- tests/models/test_detection.py | 20 ++++----- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 37227085d7..15b8b43be8 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -39,6 +39,40 @@ complete_box_iou_loss = None +def _get_iou_and_loss_functions(name: str): + """Returns functions for calculating the IoU and the IoU loss, given the IoU variant name. + + Args: + name: Name of the IoU variant. Either "iou", "giou", "diou", or "ciou". + + Returns: + A tuple of two functions. The first function calculates the pairwise IoU and the second function calculates the + elementwise loss. + """ + if name == "iou": + iou_func = box_iou + loss_func = None + elif name == "giou": + iou_func = generalized_box_iou + loss_func = generalized_box_iou_loss + elif name == "diou": + iou_func = distance_box_iou + loss_func = distance_box_iou_loss + elif name == "ciou": + iou_func = complete_box_iou + loss_func = complete_box_iou_loss + else: + raise ValueError(f"Unknown IoU function '{name}'.") + + if not callable(iou_func): + raise ValueError(f"The IoU function '{name}' is not supported by the installed version of Torchvision.") + + if not callable(loss_func): + loss_func = lambda boxes1, boxes2: 1.0 - iou_func(boxes1, boxes2).diagonal() + + return iou_func, loss_func + + def _size_compensation(targets: Tensor, image_size: Tensor) -> Tuple[Tensor, Tensor]: """Calcuates the size compensation factor for the overlap loss. 
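The `_get_iou_and_loss_functions` helper added above maps an IoU variant name to a pairwise IoU function and an elementwise loss function; the loss class later in this file stores them as `_pairwise_overlap` and `_elementwise_overlap_loss`. A rough usage sketch of that contract follows. The import path matches the file touched in this patch, but the helper is private, so treat the snippet purely as an illustration; note also that the GIoU, DIoU, and CIoU losses require Torchvision 0.12 or newer.

```python
import torch

# Hypothetical usage of the private helper defined in this patch; for illustration only.
from pl_bolts.models.detection.yolo.loss import _get_iou_and_loss_functions

pairwise_iou, elementwise_loss = _get_iou_and_loss_functions("giou")

preds = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]])
targets = torch.tensor([[1.0, 1.0, 11.0, 11.0], [20.0, 20.0, 30.0, 30.0]])

print(pairwise_iou(preds, targets).shape)      # torch.Size([2, 2]) - every prediction against every target
print(elementwise_loss(preds, targets).shape)  # torch.Size([2]) - prediction i against target i
```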
@@ -186,30 +220,11 @@ def __init__( confidence_multiplier: float = 1.0, class_multiplier: float = 1.0, ): - overlap_loss_func = None - if overlap_func == "iou": - overlap_func = box_iou - elif overlap_func == "giou": - overlap_func = generalized_box_iou - overlap_loss_func = generalized_box_iou_loss - elif overlap_func == "diou": - overlap_func = distance_box_iou - overlap_loss_func = distance_box_iou_loss - elif overlap_func == "ciou": - overlap_func = complete_box_iou - overlap_loss_func = complete_box_iou_loss - - if not callable(overlap_func): - raise ValueError( - f"Unsupported overlap function '{overlap_func}'. Try upgrading Torchvision or using another IoU " - "algorithm." - ) - self._pairwise_overlap = overlap_func - - if callable(overlap_loss_func): - self._elementwise_overlap_loss = overlap_loss_func - else: + if callable(overlap_func): + self._pairwise_overlap = overlap_func self._elementwise_overlap_loss = lambda boxes1, boxes2: 1.0 - overlap_func(boxes1, boxes2).diagonal() + else: + self._pairwise_overlap, self._elementwise_overlap_loss = _get_iou_and_loss_functions(overlap_func) self.predict_overlap = predict_overlap self.overlap_multiplier = overlap_multiplier diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 574bd82f83..933d0157e3 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -126,7 +126,7 @@ def test_darknet_train(tmpdir, cfg_name, catch_warnings): def test_yolov4_tiny(catch_warnings): - network = YOLOV4TinyNetwork(num_classes=2, width=4) + network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -140,7 +140,7 @@ def test_yolov4_tiny_train(tmpdir): category=PossibleUserWarning, ) - network = YOLOV4TinyNetwork(num_classes=2, width=4) + network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -151,7 +151,7 @@ def test_yolov4_tiny_train(tmpdir): def test_yolov4(catch_warnings): - network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -165,7 +165,7 @@ def test_yolov4_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -176,7 +176,7 @@ def test_yolov4_train(tmpdir, catch_warnings): def test_yolov4p6(catch_warnings): - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -190,7 +190,7 @@ def test_yolov4p6_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -201,7 +201,7 @@ def test_yolov4p6_train(tmpdir, catch_warnings): def test_yolov5(catch_warnings): - network = 
YOLOV5Network(num_classes=2, depth=1, width=4) + network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -215,7 +215,7 @@ def test_yolov5_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV5Network(num_classes=2, depth=1, width=4) + network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -226,7 +226,7 @@ def test_yolov5_train(tmpdir, catch_warnings): def test_yolox(catch_warnings): - network = YOLOXNetwork(num_classes=2, depth=1, width=4) + network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) image = torch.rand(1, 3, 256, 256) @@ -240,7 +240,7 @@ def test_yolox_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOXNetwork(num_classes=2, depth=1, width=4) + network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) From ff2a5210c3a5513a78521e0f71e5191540cc49f4 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sat, 15 Oct 2022 13:36:55 +0300 Subject: [PATCH 43/76] Fixed type annotation --- pl_bolts/models/detection/yolo/loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 15b8b43be8..5df916d49b 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -1,3 +1,4 @@ +from ast import Call from dataclasses import dataclass from typing import Callable, Dict, Optional, Tuple, Union @@ -39,7 +40,7 @@ complete_box_iou_loss = None -def _get_iou_and_loss_functions(name: str): +def _get_iou_and_loss_functions(name: str) -> Tuple[Callable, Callable]: """Returns functions for calculating the IoU and the IoU loss, given the IoU variant name. 
Args: From a040ffd52bbd7e554c8e17c208e724abe94322bf Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sat, 15 Oct 2022 13:40:12 +0300 Subject: [PATCH 44/76] Removed unused import --- pl_bolts/models/detection/yolo/loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 5df916d49b..e76a12f60c 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -1,4 +1,3 @@ -from ast import Call from dataclasses import dataclass from typing import Callable, Dict, Optional, Tuple, Union From 2bc8a31b7b8cf92c036286007b4507909628ad68 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sat, 15 Oct 2022 16:23:33 +0300 Subject: [PATCH 45/76] Check typing for YOLO --- pl_bolts/models/detection/yolo/yolo_module.py | 2 +- pyproject.toml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 2a68bf3115..0363bc5512 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -494,7 +494,7 @@ def __init__( **kwargs: Any, ) -> None: if network_config is not None: - network = DarknetNetwork( + network: nn.Module = DarknetNetwork( network_config, darknet_weights, matching_algorithm=matching_algorithm, diff --git a/pyproject.toml b/pyproject.toml index a7575c8baa..81ad91a392 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,9 +66,6 @@ module = [ "pl_bolts.models.detection.faster_rcnn.faster_rcnn_module", "pl_bolts.models.detection.retinanet.backbones", "pl_bolts.models.detection.retinanet.retinanet_module", - "pl_bolts.models.detection.yolo.yolo_config", - "pl_bolts.models.detection.yolo.yolo_layers", - "pl_bolts.models.detection.yolo.yolo_module", "pl_bolts.models.gans.basic.basic_gan_module", "pl_bolts.models.gans.basic.components", "pl_bolts.models.gans.dcgan.dcgan_module", From cc21337d8f6de7807759f71310bf98ff60c193c4 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 17 Oct 2022 13:04:40 +0300 Subject: [PATCH 46/76] Fixed hyperlinks --- pl_bolts/models/detection/yolo/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pl_bolts/models/detection/yolo/README.md b/pl_bolts/models/detection/yolo/README.md index eb5bb938a8..9b8c5e2d29 100644 --- a/pl_bolts/models/detection/yolo/README.md +++ b/pl_bolts/models/detection/yolo/README.md @@ -1,13 +1,13 @@ # YOLO -The YOLO model has evolved quite a bit, since the original publication in 2016. The original source code was written in C, using a framework called [Darknet](https://github.com/pjreddie/darknet). The final revision by the original author was called YOLOv3 and described in an [arXiv paper](https://arxiv.org/abs/1804.02767). Later various other authors have written implementations that improve various different aspects of the model or the training procedure. [YOLOv4 implementation](https://github.com/AlexeyAB/darknet) was still based on Darknet and [YOLOv5](https://github.com/ultralytics/yolov5) was written using PyTorch. Most other implementations are based on these. +The YOLO model has evolved quite a bit, since the original publication in 2016. The original source code was written in C, using a framework called [Darknet](https://github.com/pjreddie/darknet). The final revision by the original author was called YOLOv3 and described in an [arXiv paper](https://arxiv.org/abs/1804.02767). 
Later various other authors have written implementations that improve different aspects of the model or the training procedure. [YOLOv4 implementation](https://github.com/AlexeyAB/darknet) was still based on Darknet and [YOLOv5](https://github.com/ultralytics/yolov5) was written using PyTorch. Most other implementations are based on these. This PyTorch Lightning implementation combines features from some of the notable YOLO implementations. The most important papers are: -- *YOLOv3*: [https://arxiv.org/abs/1804.02767](Joseph Redmon and Ali Farhadi) -- *YOLOv4*: [https://arxiv.org/abs/2004.10934>](Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao) -- *Scaled-YOLOv4*: [https://arxiv.org/abs/2011.08036](Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao) -- *YOLOX*: [https://arxiv.org/abs/2107.08430](Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun) +- *YOLOv3*: [Joseph Redmon and Ali Farhadi](https://arxiv.org/abs/1804.02767) +- *YOLOv4*: [Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2004.10934) +- *Scaled-YOLOv4*: [Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2011.08036) +- *YOLOX*: [Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun](https://arxiv.org/abs/2107.08430) ## Network Architecture @@ -25,10 +25,10 @@ A detection head can try to detect objects at each of the anchor points that are The width and the height of a bounding box is detected relative to a prior shape. `anchors_per_cell` prior shapes per detection head are defined in the network configuration. That is, if the network uses three detection heads, and each head detects three bounding boxes per grid cell, nine prior shapes need to be defined. They are defined in the Darknet configuration file or provided to the network class constructor. The defaults values have been obtained by clustering bounding box shapes in the COCO dataset. Note that if you use a different image size, you probably want to scale the prior shapes too. -With the exception of the SimOTA matching algorithm, the prior shapes are also used for matching the ground-truth targets to anchors during training. In this case targets are matched only to anchors from the closest grid cell. The prior shapes are used to determine, to which anchors from that cell the target is matched. The losses are computed between the targets boxes and the predictions that correspond to their matched anchors. Different matching rules have been implemented: +With the exception of the SimOTA matching algorithm, the prior shapes are also used for matching the ground-truth targets to anchors during training. In this case targets are matched only to anchors from the closest grid cell. The prior shapes are used to determine, to which anchors from that cell the target is matched. The losses are computed between the target boxes and the predictions that correspond to their matched anchors. Different matching rules have been implemented: - *maxiou*: The original matching rule that matches a target to the prior shape that gives the highest IoU. -- *iou*: Matches a target to an anchor, if the IoU between the target and the prior shape is above a threshold. Multiple anchors may be matched to the same target, and the loss will be computed from a number of pairs that is generally not the same as number of ground-truth boxes. +- *iou*: Matches a target to an anchor, if the IoU between the target and the prior shape is above a threshold. 
Multiple anchors may be matched to the same target, and the loss will be computed from a number of pairs that is generally not the same as the number of ground-truth boxes. - *size*: Calculates the ratio between the width and height of the target box to the prior width and height. If both the width and the height are close enough to the prior shape, matches the target to the anchor. - *simota*: The SimOTA matching algorithm from YOLOX. Targets can be matched not only to anchors from the closest grid cell, but to any anchors that are inside the target bounding box. The matching algorithm is based on Optimal Transport and uses the training loss between the target and the predictions as the cost. That is, the prior shapes are not used for matching, but the predictions corresponding to the anchors. @@ -54,9 +54,9 @@ The data module needs to resize the data to a suitable size, in addition to any During inference, the model requires only the input images. `forward()` method receives a mini-batch of images in a tensor with shape `[N, channels, height, width]`. -Every detection head predicts a bounding box at every anchor point. `forward()` returns the predictions from all detection heads in a tensor with shape `[N, anchors, classes + 5]`, where `anchors` is the total number of anchors in all detection heads. The predictions are `x1`, `y1`, `x2`, `y2`, width, height, confidence, and the probability for each class. The coordinates are scaled to the input image size. +Every detection head predicts a bounding box at every anchor. `forward()` returns the predictions from all detection heads in a tensor with shape `[N, anchors, classes + 5]`, where `anchors` is the total number of anchors in all detection heads. The predictions are `x1`, `y1`, `x2`, `y2`, confidence, and the probability for each class. The coordinates are scaled to the input image size. -`infer()` method filters and processes the predictions. A class-specific score is obtained by multiplying the class probability with the detection confidence. Only detections with a high enough score are kept. YOLO does not use `softmax` to normalize the class probabilities, but each probability is normalized individually using `sigmoid`. Consequently, one object can be assigned to multiple categories. If more than one class has a score that is above the confidence threshold, these will be split into multiple detections. Then the detections are filtered using non-maximum suppression. The processed output is returned in a dictionary containing the following tensors: +`infer()` method filters and processes the predictions. A class-specific score is obtained by multiplying the class probability with the detection confidence. Only detections with a high enough score are kept. YOLO does not use `softmax` to normalize the class probabilities, but each probability is normalized individually using `sigmoid`. Consequently, one object can be assigned to multiple categories. If more than one class has a score that is above the confidence threshold, these will be split into multiple detections during postprocessing. Then the detections are filtered using non-maximum suppression. 
The processed output is returned in a dictionary containing the following tensors: - *boxes*: a matrix of predicted bounding box `(x1, y1, x2, y2)` coordinates in image space - *scores*: a vector of detection confidences From 4083bd22a822dd8b4c68a4be0210794b3d560ebd Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sun, 23 Oct 2022 12:04:27 +0300 Subject: [PATCH 47/76] Fixed mypy errors --- pl_bolts/models/detection/yolo/yolo_module.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 0363bc5512..9f270ebd25 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -19,7 +19,7 @@ if _TORCHMETRICS_DETECTION_AVAILABLE: try: - from torchmetrics.detection import MeanAveragePrecision # type: ignore + from torchmetrics.detection import MeanAveragePrecision _MEAN_AVERAGE_PRECISION_AVAILABLE = True except ImportError: @@ -173,9 +173,7 @@ def __init__( self._val_map = MeanAveragePrecision() self._test_map = MeanAveragePrecision() - def forward( # type: ignore - self, images: Tensor, targets: Optional[TARGETS] = None - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Tensor, Tuple[Tensor, Tensor]]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets are provided, computes the losses from the detection layers. @@ -239,7 +237,7 @@ def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[optim.lr_sch lr_scheduler = self.lr_scheduler_class(optimizer, **self.lr_scheduler_params) return [optimizer], [lr_scheduler] - def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> STEP_OUTPUT: # type: ignore + def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> STEP_OUTPUT: """Computes the training loss. Args: @@ -262,9 +260,7 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> return {"loss": losses.sum()} - def validation_step( # type: ignore - self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int - ) -> Optional[STEP_OUTPUT]: + def validation_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: """Evaluates a batch of data from the validation set. Args: @@ -292,7 +288,7 @@ def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) self.log_dict(map_scores, sync_dist=True) self._val_map.reset() - def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: # type: ignore + def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: """Evaluates a batch of data from the test set. Args: @@ -340,6 +336,7 @@ class labels. 
images, _ = validate_batch(batch) detections = self(images) detections = self.process_detections(detections) + return detections def infer(self, image: Tensor) -> Dict[str, Tensor]: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class From 8457fee431ea3b9274b8ddb8de6a79e5c3cd44f7 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 24 Oct 2022 20:28:11 +0300 Subject: [PATCH 48/76] Removed iou and giou metrics and losses, as these are provided by Torchvision --- docs/source/losses.rst | 26 --------- pl_bolts/losses/object_detection.py | 58 -------------------- pl_bolts/metrics/object_detection.py | 75 -------------------------- tests/losses/test_object_detection.py | 44 --------------- tests/metrics/test_object_detection.py | 72 ------------------------- 5 files changed, 275 deletions(-) delete mode 100644 pl_bolts/losses/object_detection.py delete mode 100644 pl_bolts/metrics/object_detection.py delete mode 100644 tests/losses/test_object_detection.py delete mode 100644 tests/metrics/test_object_detection.py diff --git a/docs/source/losses.rst b/docs/source/losses.rst index ad6ae75f0e..4bfaa8d671 100644 --- a/docs/source/losses.rst +++ b/docs/source/losses.rst @@ -1,29 +1,3 @@ -Object Detection -====================== -These are common losses used in object detection. - -.. note:: - - We rely on the community to keep these updated and working. If something doesn't work, we'd really appreciate a contribution to fix! - ---------------- - -GIoU Loss ---------- - -.. autofunction:: pl_bolts.losses.object_detection.giou_loss - :noindex: - ---------------- - -IoU Loss --------- - -.. autofunction:: pl_bolts.losses.object_detection.iou_loss - :noindex: - ---------------- - Reinforcement Learning ====================== These are common losses used in RL. diff --git a/pl_bolts/losses/object_detection.py b/pl_bolts/losses/object_detection.py deleted file mode 100644 index b7c8fef6e8..0000000000 --- a/pl_bolts/losses/object_detection.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Loss functions for Object Detection task.""" - -from torch import Tensor - -from pl_bolts.metrics.object_detection import giou, iou -from pl_bolts.utils.stability import under_review - - -@under_review() -def iou_loss(preds: Tensor, target: Tensor) -> Tensor: - """Calculates the intersection over union loss. - - Args: - preds: batch of prediction bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - target: batch of target bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - - Example: - - >>> import torch - >>> from pl_bolts.losses.object_detection import iou_loss - >>> preds = torch.tensor([[100, 100, 200, 200]]) - >>> target = torch.tensor([[150, 150, 250, 250]]) - >>> iou_loss(preds, target) - tensor([[0.8571]]) - - Returns: - IoU loss - """ - loss = 1 - iou(preds, target) - return loss - - -@under_review() -def giou_loss(preds: Tensor, target: Tensor) -> Tensor: - """Calculates the generalized intersection over union loss. - - It has been proposed in `Generalized Intersection over Union: A Metric and A - Loss for Bounding Box Regression `_. 
- - Args: - preds: an Nx4 batch of prediction bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - target: an Mx4 batch of target bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - - Example: - - >>> import torch - >>> from pl_bolts.losses.object_detection import giou_loss - >>> preds = torch.tensor([[100, 100, 200, 200]]) - >>> target = torch.tensor([[150, 150, 250, 250]]) - >>> giou_loss(preds, target) - tensor([[1.0794]]) - - Returns: - GIoU loss in an NxM tensor containing the pairwise GIoU loss for every element in preds and target, - where N is the number of prediction bounding boxes and M is the number of target bounding boxes - """ - loss = 1 - giou(preds, target) - return loss diff --git a/pl_bolts/metrics/object_detection.py b/pl_bolts/metrics/object_detection.py deleted file mode 100644 index 55f7582d79..0000000000 --- a/pl_bolts/metrics/object_detection.py +++ /dev/null @@ -1,75 +0,0 @@ -import torch -from torch import Tensor - - -def iou(preds: Tensor, target: Tensor) -> Tensor: - """Calculates the intersection over union. - - Args: - preds: an Nx4 batch of prediction bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - target: an Mx4 batch of target bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - - Example: - - >>> import torch - >>> from pl_bolts.metrics.object_detection import iou - >>> preds = torch.tensor([[100, 100, 200, 200]]) - >>> target = torch.tensor([[150, 150, 250, 250]]) - >>> iou(preds, target) - tensor([[0.1429]]) - - Returns: - IoU tensor: an NxM tensor containing the pairwise IoU values for every element in preds and target, - where N is the number of prediction bounding boxes and M is the number of target bounding boxes - """ - x_min = torch.max(preds[:, None, 0], target[:, 0]) - y_min = torch.max(preds[:, None, 1], target[:, 1]) - x_max = torch.min(preds[:, None, 2], target[:, 2]) - y_max = torch.min(preds[:, None, 3], target[:, 3]) - intersection = (x_max - x_min).clamp(min=0) * (y_max - y_min).clamp(min=0) - pred_area = (preds[:, 2] - preds[:, 0]) * (preds[:, 3] - preds[:, 1]) - target_area = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) - union = pred_area[:, None] + target_area - intersection - iou_value = torch.true_divide(intersection, union) - return iou_value - - -def giou(preds: Tensor, target: Tensor) -> Tensor: - """Calculates the generalized intersection over union. - - It has been proposed in `Generalized Intersection over Union: A Metric and A - Loss for Bounding Box Regression `_. 
- - Args: - preds: an Nx4 batch of prediction bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - target: an Mx4 batch of target bounding boxes with representation ``[x_min, y_min, x_max, y_max]`` - - Example: - - >>> import torch - >>> from pl_bolts.metrics.object_detection import giou - >>> preds = torch.tensor([[100, 100, 200, 200]]) - >>> target = torch.tensor([[150, 150, 250, 250]]) - >>> giou(preds, target) - tensor([[-0.0794]]) - - Returns: - GIoU in an NxM tensor containing the pairwise GIoU values for every element in preds and target, - where N is the number of prediction bounding boxes and M is the number of target bounding boxes - """ - x_min = torch.max(preds[:, None, 0], target[:, 0]) - y_min = torch.max(preds[:, None, 1], target[:, 1]) - x_max = torch.min(preds[:, None, 2], target[:, 2]) - y_max = torch.min(preds[:, None, 3], target[:, 3]) - intersection = (x_max - x_min).clamp(min=0) * (y_max - y_min).clamp(min=0) - pred_area = (preds[:, 2] - preds[:, 0]) * (preds[:, 3] - preds[:, 1]) - target_area = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) - union = pred_area[:, None] + target_area - intersection - C_x_min = torch.min(preds[:, None, 0], target[:, 0]) - C_y_min = torch.min(preds[:, None, 1], target[:, 1]) - C_x_max = torch.max(preds[:, None, 2], target[:, 2]) - C_y_max = torch.max(preds[:, None, 3], target[:, 3]) - C_area = (C_x_max - C_x_min).clamp(min=0) * (C_y_max - C_y_min).clamp(min=0) - iou_value = torch.true_divide(intersection, union) - giou_value = iou_value - torch.true_divide((C_area - union), C_area) - return giou_value diff --git a/tests/losses/test_object_detection.py b/tests/losses/test_object_detection.py deleted file mode 100644 index 7d95ae1ef3..0000000000 --- a/tests/losses/test_object_detection.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Test Object Detection Loss Functions.""" - -import pytest -import torch - -from pl_bolts.losses.object_detection import giou_loss, iou_loss - - -@pytest.mark.parametrize( - "preds, target, expected_loss", - [(torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 100, 200, 200]]), torch.tensor([[0.0]]))], -) -def test_iou_complete_overlap(preds, target, expected_loss): - torch.testing.assert_allclose(iou_loss(preds, target), expected_loss) - - -@pytest.mark.parametrize( - "preds, target, expected_loss", - [ - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 200, 200, 300]]), torch.tensor([[1.0]])), - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[200, 200, 300, 300]]), torch.tensor([[1.0]])), - ], -) -def test_iou_no_overlap(preds, target, expected_loss): - torch.testing.assert_allclose(iou_loss(preds, target), expected_loss) - - -@pytest.mark.parametrize( - "preds, target, expected_loss", - [(torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 100, 200, 200]]), torch.tensor([[0.0]]))], -) -def test_complete_overlap(preds, target, expected_loss): - torch.testing.assert_allclose(giou_loss(preds, target), expected_loss) - - -@pytest.mark.parametrize( - "preds, target, expected_loss", - [ - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 200, 200, 300]]), torch.tensor([[1.0]])), - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[200, 200, 300, 300]]), torch.tensor([[1.5]])), - ], -) -def test_no_overlap(preds, target, expected_loss): - torch.testing.assert_allclose(giou_loss(preds, target), expected_loss) diff --git a/tests/metrics/test_object_detection.py b/tests/metrics/test_object_detection.py deleted file mode 100644 index 6eb78d7247..0000000000 
--- a/tests/metrics/test_object_detection.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Test Object Detection Metric Functions.""" - -import pytest -import torch - -from pl_bolts.metrics.object_detection import giou, iou - - -@pytest.mark.parametrize( - "preds, target, expected_iou", - [(torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 100, 200, 200]]), torch.tensor([[1.0]]))], -) -def test_iou_complete_overlap(preds, target, expected_iou): - torch.testing.assert_close(iou(preds, target), expected_iou) - - -@pytest.mark.parametrize( - "preds, target, expected_iou", - [ - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 200, 200, 300]]), torch.tensor([[0.0]])), - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[200, 200, 300, 300]]), torch.tensor([[0.0]])), - ], -) -def test_iou_no_overlap(preds, target, expected_iou): - torch.testing.assert_close(iou(preds, target), expected_iou) - - -@pytest.mark.parametrize( - "preds, target, expected_iou", - [ - ( - torch.tensor([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]), - torch.tensor([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]), - torch.tensor([[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]), - ) - ], -) -def test_iou_multi(preds, target, expected_iou): - torch.testing.assert_close(iou(preds, target), expected_iou) - - -@pytest.mark.parametrize( - "preds, target, expected_giou", - [(torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 100, 200, 200]]), torch.tensor([[1.0]]))], -) -def test_complete_overlap(preds, target, expected_giou): - torch.testing.assert_close(giou(preds, target), expected_giou) - - -@pytest.mark.parametrize( - "preds, target, expected_giou", - [ - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[100, 200, 200, 300]]), torch.tensor([[0.0]])), - (torch.tensor([[100, 100, 200, 200]]), torch.tensor([[200, 200, 300, 300]]), torch.tensor([[-0.5]])), - ], -) -def test_no_overlap(preds, target, expected_giou): - torch.testing.assert_close(giou(preds, target), expected_giou) - - -@pytest.mark.parametrize( - "preds, target, expected_giou", - [ - ( - torch.tensor([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]), - torch.tensor([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]), - torch.tensor([[1.0, 0.25, -0.7778], [0.25, 1.0, -0.8611], [-0.7778, -0.8611, 1.0]]), - ) - ], -) -def test_giou_multi(preds, target, expected_giou): - torch.testing.assert_close(giou(preds, target), expected_giou, atol=0.0001, rtol=0.0001) From 8d4a3b4614603f7688d0cb1d0900c38be543ba08 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Nov 2022 16:52:32 +0200 Subject: [PATCH 49/76] Fixed by mdformat --- pl_bolts/models/detection/yolo/README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pl_bolts/models/detection/yolo/README.md b/pl_bolts/models/detection/yolo/README.md index 9b8c5e2d29..67df445502 100644 --- a/pl_bolts/models/detection/yolo/README.md +++ b/pl_bolts/models/detection/yolo/README.md @@ -9,7 +9,6 @@ This PyTorch Lightning implementation combines features from some of the notable - *Scaled-YOLOv4*: [Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2011.08036) - *YOLOX*: [Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun](https://arxiv.org/abs/2107.08430) - ## Network Architecture Any network can be used with YOLO detection heads as long as it produces feature maps with the correct number of features. 
Typically the network consists of a CNN backbone combined with a [Feature Pyramid Network](https://arxiv.org/abs/1612.03144) or a [Path Aggregation Network](https://arxiv.org/abs/1803.01534). Backbone layers reduce the size of the feature map and the network may contain multiple detection heads that operate at different resolutions. @@ -18,7 +17,6 @@ The user can write the network architecture in PyTorch, or construct a computati There are several network architectures included in the [`torch_networks`](https://github.com/Lightning-AI/lightning-bolts/tree/master/pl_bolts/models/detection/yolo/torch_networks.py) module (YOLOv4, YOLOv5, YOLOX). Larger and smaller variants of these models can be created by varying the `width` and `depth` arguments. - ## Anchors A detection head can try to detect objects at each of the anchor points that are spaced evenly across the image in a grid. The size of the grid is determined by the width and height of the feature map. There can be a number of anchors (typically three) per grid cell. The number of features predicted per grid cell has to be `(5 + num_classes) * anchors_per_cell`. @@ -32,7 +30,6 @@ With the exception of the SimOTA matching algorithm, the prior shapes are also u - *size*: Calculates the ratio between the width and height of the target box to the prior width and height. If both the width and the height are close enough to the prior shape, matches the target to the anchor. - *simota*: The SimOTA matching algorithm from YOLOX. Targets can be matched not only to anchors from the closest grid cell, but to any anchors that are inside the target bounding box. The matching algorithm is based on Optimal Transport and uses the training loss between the target and the predictions as the cost. That is, the prior shapes are not used for matching, but the predictions corresponding to the anchors. - ## Input Data The model input is expected to be a list of images. Each image is a tensor with shape `[channels, height, width]`. The images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can have different image sizes. The feature pyramid network introduces another constraint on the image size: the width and the height have to be divisible by the ratio in which the network downsamples the input. @@ -42,14 +39,12 @@ During training, the model expects both the image tensors and a list of targets. - *boxes*: `(x1, y1, x2, y2)` coordinates of the ground-truth boxes in a matrix with shape `[N, 4]`. - *labels*: Either integer class labels in a vector of size `N` or a class mask for each ground-truth box in a boolean matrix with shape `[N, classes]` - ## Training The command line application demonstrates how to train a YOLO model using PyTorch Lightning. The first step is to create a network, either from a Darknet configuration file, or using one of the included PyTorch networks. The network is passed to the YOLO model constructor. The data module needs to resize the data to a suitable size, in addition to any augmenting transforms. For example, YOLOv4 network requires that the width and the height are multiples of 32. - ## Inference During inference, the model requires only the input images. `forward()` method receives a mini-batch of images in a tensor with shape `[N, channels, height, width]`. 
From d57055a2b57a3987db11bc8769272da7511d1969 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Nov 2022 17:35:41 +0200 Subject: [PATCH 50/76] Avoid using a lambda function --- pl_bolts/models/detection/yolo/loss.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index e76a12f60c..f5704ae0c8 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -68,8 +68,12 @@ def _get_iou_and_loss_functions(name: str) -> Tuple[Callable, Callable]: raise ValueError(f"The IoU function '{name}' is not supported by the installed version of Torchvision.") if not callable(loss_func): - loss_func = lambda boxes1, boxes2: 1.0 - iou_func(boxes1, boxes2).diagonal() + def loss_func(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - iou_func(boxes1, boxes2).diagonal() + + assert callable(iou_func) + assert callable(loss_func) return iou_func, loss_func From 9e0a33f3cb0db7751f8f2c5cb5a2a7249af27afd Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Nov 2022 18:39:25 +0200 Subject: [PATCH 51/76] Avoid local functions --- pl_bolts/models/detection/yolo/loss.py | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index f5704ae0c8..48535028a7 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -39,6 +39,25 @@ complete_box_iou_loss = None +box_iou_loss = lambda boxes1, boxes2: 1.0 - box_iou(boxes1, boxes2).diagonal() + +if (generalized_box_iou_loss is None) and (generalized_box_iou is not None): + generalized_box_iou_loss = lambda boxes1, boxes2: 1.0 - generalized_box_iou(boxes1, boxes2).diagonal() + +if (distance_box_iou_loss is None) and (distance_box_iou is not None): + distance_box_iou_loss = lambda boxes1, boxes2: 1.0 - distance_box_iou(boxes1, boxes2).diagonal() + +if (complete_box_iou_loss is None) and (complete_box_iou is not None): + complete_box_iou_loss = lambda boxes1, boxes2: 1.0 - complete_box_iou(boxes1, boxes2).diagonal() + +_iou_and_loss_functions = { + "iou": (box_iou, box_iou_loss), + "giou": (generalized_box_iou, generalized_box_iou_loss), + "diou": (distance_box_iou, distance_box_iou_loss), + "ciou": (complete_box_iou, complete_box_iou_loss), +} + + def _get_iou_and_loss_functions(name: str) -> Tuple[Callable, Callable]: """Returns functions for calculating the IoU and the IoU loss, given the IoU variant name. @@ -49,30 +68,11 @@ def _get_iou_and_loss_functions(name: str) -> Tuple[Callable, Callable]: A tuple of two functions. The first function calculates the pairwise IoU and the second function calculates the elementwise loss. 
""" - if name == "iou": - iou_func = box_iou - loss_func = None - elif name == "giou": - iou_func = generalized_box_iou - loss_func = generalized_box_iou_loss - elif name == "diou": - iou_func = distance_box_iou - loss_func = distance_box_iou_loss - elif name == "ciou": - iou_func = complete_box_iou - loss_func = complete_box_iou_loss - else: + if name not in _iou_and_loss_functions: raise ValueError(f"Unknown IoU function '{name}'.") - + iou_func, loss_func = _iou_and_loss_functions[name] if not callable(iou_func): raise ValueError(f"The IoU function '{name}' is not supported by the installed version of Torchvision.") - - if not callable(loss_func): - - def loss_func(boxes1: Tensor, boxes2: Tensor) -> Tensor: - return 1.0 - iou_func(boxes1, boxes2).diagonal() - - assert callable(iou_func) assert callable(loss_func) return iou_func, loss_func From fa8ad41766491e10ad86be1e6beae4e1dafe77ca Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Nov 2022 19:18:40 +0200 Subject: [PATCH 52/76] Avoid lambda functions --- pl_bolts/models/detection/yolo/loss.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 48535028a7..034f962637 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -42,13 +42,22 @@ box_iou_loss = lambda boxes1, boxes2: 1.0 - box_iou(boxes1, boxes2).diagonal() if (generalized_box_iou_loss is None) and (generalized_box_iou is not None): - generalized_box_iou_loss = lambda boxes1, boxes2: 1.0 - generalized_box_iou(boxes1, boxes2).diagonal() + + def generalized_box_iou_loss(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - generalized_box_iou(boxes1, boxes2).diagonal() + if (distance_box_iou_loss is None) and (distance_box_iou is not None): - distance_box_iou_loss = lambda boxes1, boxes2: 1.0 - distance_box_iou(boxes1, boxes2).diagonal() + + def distance_box_iou_loss(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - distance_box_iou(boxes1, boxes2).diagonal() + if (complete_box_iou_loss is None) and (complete_box_iou is not None): - complete_box_iou_loss = lambda boxes1, boxes2: 1.0 - complete_box_iou(boxes1, boxes2).diagonal() + + def complete_box_iou_loss(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - complete_box_iou(boxes1, boxes2).diagonal() + _iou_and_loss_functions = { "iou": (box_iou, box_iou_loss), From 54f1eb03e7a1e027bc9dcbd8107a68fb3d8f8bba Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 7 Nov 2022 21:04:42 +0200 Subject: [PATCH 53/76] Avoid a lambda function --- pl_bolts/models/detection/yolo/loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index 034f962637..fea1a916ea 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -39,7 +39,9 @@ complete_box_iou_loss = None -box_iou_loss = lambda boxes1, boxes2: 1.0 - box_iou(boxes1, boxes2).diagonal() +def box_iou_loss(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - box_iou(boxes1, boxes2).diagonal() + if (generalized_box_iou_loss is None) and (generalized_box_iou is not None): From b95caf2613b689ab4d980799869afafa263b8eb9 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 15 Dec 2022 18:28:10 +0200 Subject: [PATCH 54/76] Use sync_dist=True and don't fail if there are no step outputs --- pl_bolts/models/detection/yolo/yolo_module.py | 24 ++++++++++++------- 1 file 
changed, 16 insertions(+), 8 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 9f270ebd25..34debba3d1 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -202,7 +202,7 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te total_hits = sum(hits) for layer_idx, layer_hits in enumerate(hits): hit_rate: Union[Tensor, float] = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 - self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=False, batch_size=images.size(0)) + self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=True, batch_size=images.size(0)) losses = torch.stack(losses).sum(0) return detections, losses @@ -251,12 +251,10 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> images, targets = validate_batch(batch) _, losses = self(images, targets) - # sync_dist=True is broken in some versions of Lightning and may cause the sum of the loss - # across GPUs to be returned. - self.log("train/overlap_loss", losses[0], prog_bar=True, sync_dist=False) - self.log("train/confidence_loss", losses[1], prog_bar=True, sync_dist=False) - self.log("train/class_loss", losses[2], prog_bar=True, sync_dist=False) - self.log("train/total_loss", losses.sum(), sync_dist=False) + self.log("train/overlap_loss", losses[0], prog_bar=True, sync_dist=True) + self.log("train/confidence_loss", losses[1], prog_bar=True, sync_dist=True) + self.log("train/class_loss", losses[2], prog_bar=True, sync_dist=True) + self.log("train/total_loss", losses.sum(), sync_dist=True) return {"loss": losses.sum()} @@ -282,6 +280,11 @@ def validation_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) - self._val_map.update(detections, targets) def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + # When continuing training from a checkpoint, it may happen that epoch_end is called without outputs. In this + # case the metrics cannot be computed. + if not outputs: + return + if _MEAN_AVERAGE_PRECISION_AVAILABLE: map_scores = self._val_map.compute() map_scores = {"val/" + k: v for k, v in map_scores.items()} @@ -310,6 +313,11 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti self._test_map.update(detections, targets) def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: + # When continuing training from a checkpoint, it may happen that epoch_end is called without outputs. In this + # case the metrics cannot be computed. + if not outputs: + return + if _MEAN_AVERAGE_PRECISION_AVAILABLE: map_scores = self._test_map.compute() map_scores = {"test/" + k: v for k, v in map_scores.items()} @@ -531,7 +539,7 @@ def __init__( class ResizedVOCDetectionDataModule(VOCDetectionDataModule): - """A subclass of VOCDetectionDataModule that resizes the images to a specific size. YOLO expectes the image + """A subclass of ``VOCDetectionDataModule`` that resizes the images to a specific size. YOLO expectes the image size to be divisible by the ratio in which the network downsamples the image. 
Args: From e4cb505c4038b0eaf3f115c4b2c8ca9c9c21604e Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 23 Jan 2023 13:21:31 +0200 Subject: [PATCH 55/76] Added documentation --- pl_bolts/models/detection/yolo/yolo_module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 34debba3d1..543e48e7bb 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -211,7 +211,9 @@ def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[optim.lr_sch """Constructs the optimizer and learning rate scheduler based on ``self.optimizer_params`` and ``self.lr_scheduler_params``. - If weight decay is specified, it will be applied only to convolutional layer weights. + If weight decay is specified, it will be applied only to convolutional layer weights, as they contain much more + parameters than the biases and batch normalization parameters. Regularizing all parameters could lead to + underfitting. """ if ("weight_decay" in self.optimizer_params) and (self.optimizer_params["weight_decay"] != 0): defaults = copy(self.optimizer_params) From 606acb1f423e66bb926e77b83a2caf8e7c657127 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 1 Feb 2023 10:25:51 +0200 Subject: [PATCH 56/76] Fixed an off-by-one bug when reading YOLOv4 backbone depths --- .../models/detection/yolo/torch_networks.py | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index b82d7a1bbb..2a65ac98d8 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -49,11 +49,11 @@ def forward(self, x: Tensor) -> Tensor: return x + y if self.shortcut else y -class TinyBlock(nn.Module): +class TinyStage(nn.Module): """One stage of the "tiny" network architecture from YOLOv4. Args: - num_channels: Number of channels in the input and output of the block. + num_channels: Number of channels in the input and output of the stage. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -79,7 +79,7 @@ def forward(self, x: Tensor) -> Tensor: return self.mix(torch.cat((y2, y1), dim=1)) -class CSPBlock(nn.Module): +class CSPStage(nn.Module): """One stage of a Cross Stage Partial Network (CSPNet). Encapsulates a number of bottleneck blocks in the "fusion first" CSP structure. @@ -87,9 +87,9 @@ class CSPBlock(nn.Module): `Chien-Yao Wang et al. `_ Args: - in_channels: Number of input channels that the CSP block expects. - out_channels: Number of output channels that the CSP block produces. - depth: Number of bottleneck blocks that the CSP block contains. + in_channels: Number of input channels that the CSP stage expects. + out_channels: Number of output channels that the CSP stage produces. + depth: Number of bottleneck blocks that the CSP stage contains. shortcut: Whether the bottleneck blocks should include a shortcut connection. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". 
@@ -201,11 +201,11 @@ def maxpool(out_channels: int) -> nn.Module: self.stage1 = Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization) self.downsample2 = downsample(width, width * 2) - self.stage2 = TinyBlock(width * 2, activation=activation, norm=normalization) + self.stage2 = TinyStage(width * 2, activation=activation, norm=normalization) self.downsample3 = maxpool(width * 4) - self.stage3 = TinyBlock(width * 4, activation=activation, norm=normalization) + self.stage3 = TinyStage(width * 4, activation=activation, norm=normalization) self.downsample4 = maxpool(width * 8) - self.stage4 = TinyBlock(width * 8, activation=activation, norm=normalization) + self.stage4 = TinyStage(width * 8, activation=activation, norm=normalization) self.downsample5 = maxpool(width * 16) def forward(self, x: Tensor) -> List[Tensor]: @@ -224,7 +224,7 @@ def forward(self, x: Tensor) -> List[Tensor]: class YOLOV4Backbone(nn.Module): - """A backbone that approximately corresponds to the Cross Stage Partial Network from YOLOv4. + """A backbone that corresponds approximately to the Cross Stage Partial Network from YOLOv4. Args: in_channels: Number of channels in the input image. @@ -257,7 +257,7 @@ def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: - csp = CSPBlock( + csp = CSPStage( out_channels, out_channels, depth=depth, @@ -276,7 +276,10 @@ def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: convs = [conv3x3(in_channels, widths[0])] + [conv3x3(widths[0], widths[0]) for _ in range(depths[0] - 1)] self.stem = nn.Sequential(*convs) - self.stages = nn.ModuleList(stage(widths[n], widths[n + 1], depth) for n, depth in enumerate(depths[:-1])) + self.stages = nn.ModuleList( + stage(in_channels, out_channels, depth) + for in_channels, out_channels, depth in zip(widths[:-1], widths[1:], depths[1:]) + ) def forward(self, x: Tensor) -> List[Tensor]: x = self.stem(x) @@ -318,7 +321,7 @@ def downsample(in_channels: int, out_channels: int, kernel_size: int = 3) -> nn. 
) def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: - csp = CSPBlock( + csp = CSPStage( out_channels, out_channels, depth=depth, @@ -571,7 +574,7 @@ def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) def csp(in_channels: int, out_channels: int) -> nn.Module: - return CSPBlock( + return CSPStage( in_channels, out_channels, depth=2, @@ -754,7 +757,7 @@ def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) def csp(in_channels: int, out_channels: int) -> nn.Module: - return CSPBlock( + return CSPStage( in_channels, out_channels, depth=2, @@ -965,7 +968,7 @@ def out(in_channels: int) -> nn.Module: return nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs)])) def csp(in_channels: int, out_channels: int) -> nn.Module: - return CSPBlock( + return CSPStage( in_channels, out_channels, depth=depth, @@ -1185,7 +1188,7 @@ def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) def csp(in_channels: int, out_channels: int) -> nn.Module: - return CSPBlock( + return CSPStage( in_channels, out_channels, depth=depth, From b95f02620213fd8f8ecc4ca7a272157431d9517a Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 24 Feb 2023 16:22:22 +0200 Subject: [PATCH 57/76] YOLOv7 network with deep supervision --- pl_bolts/models/detection/__init__.py | 4 + .../models/detection/yolo/darknet_network.py | 7 +- pl_bolts/models/detection/yolo/layers.py | 117 +++- .../models/detection/yolo/target_matching.py | 80 +-- .../models/detection/yolo/torch_networks.py | 615 +++++++++++++++--- pl_bolts/models/detection/yolo/yolo_module.py | 4 +- 6 files changed, 650 insertions(+), 177 deletions(-) diff --git a/pl_bolts/models/detection/__init__.py b/pl_bolts/models/detection/__init__.py index aa90b09437..c2ac244d63 100644 --- a/pl_bolts/models/detection/__init__.py +++ b/pl_bolts/models/detection/__init__.py @@ -10,6 +10,8 @@ YOLOV4TinyNetwork, YOLOV5Backbone, YOLOV5Network, + YOLOV7Backbone, + YOLOV7Network, YOLOXNetwork, ) from pl_bolts.models.detection.yolo.yolo_module import YOLO @@ -26,6 +28,8 @@ "YOLOV4TinyNetwork", "YOLOV5Backbone", "YOLOV5Network", + "YOLOV7Backbone", + "YOLOV7Network", "YOLOXNetwork", "YOLO", ] diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 08eaa93e25..aff197b36c 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -101,11 +101,12 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU if isinstance(layer, (layers.RouteLayer, layers.ShortcutLayer)): x = layer(outputs) elif isinstance(layer, layers.DetectionLayer): - x = layer(x, image_size, targets) + x, preds = layer(x, image_size) detections.append(x) if targets is not None: - losses.append(layer.losses) - hits.append(layer.hits) + layer_losses, layer_hits = layer.calculate_losses(preds, targets, image_size) + losses.append(layer_losses) + hits.append(layer_hits) else: x = layer(x) diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 2170b3098a..9abce36e22 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ 
b/pl_bolts/models/detection/yolo/layers.py @@ -65,7 +65,8 @@ class DetectionLayer(nn.Module): to produce coordinate values close to one. input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and - height are scaled up so that the maximum value is four times the anchor dimension. + height are scaled up so that the maximum value is four times the anchor dimension. This is used by the + Darknet configurations of Scaled-YOLOv4. """ def __init__( @@ -89,7 +90,7 @@ def __init__( self.xy_scale = xy_scale self.input_is_normalized = input_is_normalized - def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str, Tensor]]] = None) -> Tensor: + def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, List[Dict[str, Tensor]]]: """Runs a forward pass through this YOLO detection layer. Maps cell-local coordinates to global coordinates in the image space, scales the bounding boxes with the @@ -102,14 +103,14 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str for. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. Args: - x: The output from the previous layer. Tensor of size + x: The output from the previous layer. The size of this tensor has to be ``[batch_size, anchors_per_cell * (num_classes + 5), height, width]``. image_size: Image width and height in a vector (defines the scale of the predicted and target coordinates). - targets: If set, computes losses from detection layers against these targets. A list of target dictionaries, - one for each image. Returns: - Layer output tensor, sized ``[batch_size, num_anchors * height * width, num_classes + 5]``. + The layer output, with normalized probabilities, in a tensor sized + ``[batch_size, anchors_per_cell * height * width, num_classes + 5]`` and a list of dictionaries, containing + the same predictions, but with unnormalized probabilities (for loss calculation). """ batch_size, num_features, height, width = x.shape num_attrs = self.num_classes + 5 @@ -150,37 +151,53 @@ def forward(self, x: Tensor, image_size: Tensor, targets: Optional[List[Dict[str output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) output = output.reshape(batch_size, height * width * anchors_per_cell, num_attrs) - if targets is not None: - # We want to use binary_cross_entropy_with_logits, so we'll use the unnormalized confidence and classprob, - # if possible. - preds = [{"boxes": b, "confidences": c, "classprobs": p} for b, c, p in zip(box, confidence, classprob)] - self._calculate_losses(preds, targets, image_size) + # It's better to use binary_cross_entropy_with_logits() for loss computation, so we'll provide the unnormalized + # confidence and classprob, when available. + preds = [{"boxes": b, "confidences": c, "classprobs": p} for b, c, p in zip(box, confidence, classprob)] - return output + return output, preds - def _calculate_losses( + def match_targets( self, preds: List[Dict[str, Tensor]], + return_preds: List[Dict[str, Tensor]], targets: List[Dict[str, Tensor]], image_size: Tensor, - ) -> None: - """Matches the predictions to targets and calculates the losses. Creates the attributes ``losses`` and - ``hits``. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. - ``hits`` is the number of targets that this layer was responsible for. 
+ ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + """Matches the predictions to targets. Args: - preds: List of predictions for each image. + preds: List of predictions for each image, as returned by the ``forward()`` method of this layer. These will + be matched to the training targets. + return_preds: List of predictions for each image. The matched predictions will be returned from this list. + When calculating the auxiliary loss for deep supervision, predictions from a different layer are used + for loss computation. targets: List of training targets for each image. image_size: Width and height in a vector that defines the scale of the target coordinates. + + Returns: + Two dictionaries, the matched predictions and targets. """ batch_size = len(preds) - if batch_size != len(targets): + if (len(targets) != batch_size) or (len(return_preds) != batch_size): raise ValueError("Different batch size for predictions and targets.") matches = [] - for image_preds, image_targets in zip(preds, targets): + for image_preds, image_return_preds, image_targets in zip(preds, return_preds, targets): if image_targets["boxes"].shape[0] > 0: - matched_preds, matched_targets = self.matching_func(image_preds, image_targets, image_size) + pred_selector, background_selector, target_selector = self.matching_func( + image_preds, image_targets, image_size + ) + matched_preds = { + "boxes": image_return_preds["boxes"][pred_selector], + "confidences": image_return_preds["confidences"][pred_selector], + "bg_confidences": image_return_preds["confidences"][background_selector], + "classprobs": image_return_preds["classprobs"][pred_selector], + } + matched_targets = { + "boxes": image_targets["boxes"][target_selector], + "labels": image_targets["labels"][target_selector], + } else: device = image_preds["confidences"].device matched_preds = { @@ -205,9 +222,41 @@ def _calculate_losses( "boxes": torch.cat(tuple(m[1]["boxes"] for m in matches)), "labels": torch.cat(tuple(m[1]["labels"] for m in matches)), } + return matched_preds, matched_targets + + def calculate_losses( + self, + preds: List[Dict[str, Tensor]], + targets: List[Dict[str, Tensor]], + image_size: Tensor, + loss_preds: Optional[List[Dict[str, Tensor]]] = None, + ) -> Tuple[Tensor, int]: + """Matches the predictions to targets and computes the losses. + + Args: + preds: List of predictions for each image, as returned by ``forward()``. These will be matched to the + training targets and used to compute the losses (unless another set of predictions for loss computation + is given in ``loss_preds``). + targets: List of training targets for each image. + image_size: Width and height in a vector that defines the scale of the target coordinates. + loss_preds: List of predictions for each image. If given, these will be used for loss computation, instead + of the same predictions that were used for matching. This is needed for deep supervision in YOLOv7. + + Returns: + A vector of the overlap, confidence, and classification loss, normalized by batch size, and the number of + targets that were matched to this layer. 
+ """ + if loss_preds is None: + loss_preds = preds + + matched_preds, matched_targets = self.match_targets(preds, loss_preds, targets, image_size) + losses = self.loss_func.elementwise_sums(matched_preds, matched_targets, self.input_is_normalized, image_size) - self.losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / batch_size - self.hits = len(matched_targets["boxes"]) + losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / len(preds) + + hits = len(matched_targets["boxes"]) + + return losses, hits class Conv(nn.Module): @@ -316,6 +365,20 @@ def forward(self, x: Tensor) -> Tensor: return x * torch.tanh(nn.functional.softplus(x)) +class ReOrg(nn.Module): + """Re-organizes the tensor so that every square region of four cells is placed into four different channels. + + The result is a tensor with half the width and height, and four times as many channels. + """ + + def forward(self, x): + tl = x[..., ::2, ::2] + bl = x[..., 1::2, ::2] + tr = x[..., ::2, 1::2] + br = x[..., 1::2, 1::2] + return torch.cat((tl, bl, tr, br), dim=1) + + def create_activation_module(name: Optional[str]) -> nn.Module: """Creates a layer activation module given its type as a string. @@ -361,6 +424,7 @@ def create_detection_layer( prior_shape_idxs: Sequence[int], matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, + sim_ota_range: float = 5.0, ignore_bg_threshold: float = 0.7, overlap_func: Union[str, Callable] = "ciou", predict_overlap: float = 1.0, @@ -382,6 +446,8 @@ def create_detection_layer( ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + sim_ota_range: The "simota" matching algorithm will restrict to the anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -399,14 +465,15 @@ def create_detection_layer( to produce coordinate values close to one. input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and - height are scaled up so that the maximum value is four times the anchor dimension. + height are scaled up so that the maximum value is four times the anchor dimension. This is used by the + Darknet configurations of Scaled-YOLOv4. 
""" matching_func: Union[ShapeMatching, SimOTAMatching] if matching_algorithm == "simota": loss_func = LossFunction( overlap_func, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier ) - matching_func = SimOTAMatching(loss_func) + matching_func = SimOTAMatching(loss_func, sim_ota_range) elif matching_algorithm == "size": if matching_threshold is None: raise ValueError("matching_threshold is required with size ratio matching.") diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 3dcf552c19..d9b584eaa4 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -51,7 +51,7 @@ def __call__( image_size: Input image width and height. Returns: - preds, targets: Two dictionaries that contain the matched predictions and targets. + The indices of the matched predictions, background mask, and a mask for selecting the matched targets. """ height, width = preds["boxes"].shape[:2] device = preds["boxes"].device @@ -67,27 +67,19 @@ def __call__( cell_i = grid_xy[:, 0].to(torch.int64).clamp(0, width - 1) cell_j = grid_xy[:, 1].to(torch.int64).clamp(0, height - 1) - matched_targets, matched_anchors = self.match(xywh[:, 2:]) - cell_i = cell_i[matched_targets] - cell_j = cell_j[matched_targets] + target_selector, anchor_selector = self.match(xywh[:, 2:]) + cell_i = cell_i[target_selector] + cell_j = cell_j[target_selector] # Background mask is used to select anchors that are not responsible for predicting any object, for # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a # predicted box overlaps any target significantly, or if a prediction is matched to a target. background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) - background_mask[cell_j, cell_i, matched_anchors] = False + background_mask[cell_j, cell_i, anchor_selector] = False - preds = { - "boxes": preds["boxes"][cell_j, cell_i, matched_anchors], - "confidences": preds["confidences"][cell_j, cell_i, matched_anchors], - "bg_confidences": preds["confidences"][background_mask], - "classprobs": preds["classprobs"][cell_j, cell_i, matched_anchors], - } - targets = { - "boxes": targets["boxes"][matched_targets], - "labels": targets["labels"][matched_targets], - } - return preds, targets + pred_selector = [cell_j, cell_i, anchor_selector] + + return pred_selector, background_mask, target_selector @abstractmethod def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: @@ -247,9 +239,9 @@ def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]: matching_matrix[best_targets, more_than_one_match] = True # For those predictions that were matched, get the index of the target. - matched_preds = matching_matrix.sum(0) > 0 - matched_targets = matching_matrix[:, matched_preds].int().argmax(0) - return matched_preds, matched_targets + pred_mask = matching_matrix.sum(0) > 0 + target_selector = matching_matrix[:, pred_mask].int().argmax(0) + return pred_mask, target_selector class SimOTAMatching: @@ -259,10 +251,13 @@ class SimOTAMatching: Args: loss_func: A ``LossFunction`` object that can be used to calculate the pairwise costs. + range: For each target, restrict to the anchors that are within an `N x N` grid cell are centered at the target, + where `N` is the value of this parameter. 
""" - def __init__(self, loss_func: LossFunction) -> None: + def __init__(self, loss_func: LossFunction, range: float = 5.0) -> None: self.loss_func = loss_func + self.range = range def __call__( self, @@ -278,7 +273,8 @@ def __call__( image_size: Input image width and height. Returns: - preds, targets: Two dictionaries that contain the matched predictions and targets. + A mask of predictions that were matched, background mask (inverse of the first mask), and the indices of the + matched targets. The last tensor contains as many elements as there are ``True`` values in the first mask. """ height, width, boxes_per_cell, num_classes = preds["classprobs"].shape device = preds["boxes"].device @@ -291,42 +287,36 @@ def __call__( centers = grid_centers(grid_size).view(-1, 2) * grid_to_image inside_matrix = is_inside_box(centers, targets["boxes"]) - # Set the width and height of all target bounding boxes to the size of 5 grid cells and create a matrix for + # Set the width and height of all target bounding boxes to self.range grid cells and create a matrix for # selecting the anchors that are now inside the boxes. If a small target has no anchors inside its bounding # box, it will be matched to one of these anchors, but a high penalty will ensure that anchors that are inside # the bounding box will be preferred. xywh = box_convert(targets["boxes"], in_fmt="xyxy", out_fmt="cxcywh") xy = xywh[:, :2] - wh = 5.0 * grid_to_image * torch.ones_like(xy) + wh = self.range * grid_to_image * torch.ones_like(xy) xywh = torch.cat((xy, wh), -1) boxes = box_convert(xywh, in_fmt="cxcywh", out_fmt="xyxy") close_matrix = is_inside_box(centers, boxes) - # Flatten the prediction grids and filter them using a [height*width] boolean vector that indicates whether a - # cell center is inside or close enough to one or more targets. - fg_mask = (inside_matrix | close_matrix).sum(0) > 0 - bg_mask = torch.logical_not(fg_mask) + # In the first step we restrict ourselves to the grid cells whose center is inside or close enough to one or + # more targets. The prediction grids are flattened and masked using a [height * width] boolean vector. 
+ mask = (inside_matrix | close_matrix).sum(0) > 0 shape = (height * width, boxes_per_cell) fg_preds = { - "boxes": preds["boxes"].view(*shape, 4)[fg_mask].view(-1, 4), - "confidences": preds["confidences"].view(shape)[fg_mask].view(-1), - "classprobs": preds["classprobs"].view(*shape, num_classes)[fg_mask].view(-1, num_classes), + "boxes": preds["boxes"].view(*shape, 4)[mask].view(-1, 4), + "confidences": preds["confidences"].view(shape)[mask].view(-1), + "classprobs": preds["classprobs"].view(*shape, num_classes)[mask].view(-1, num_classes), } - bg_confidences = preds["confidences"].view(shape)[bg_mask].view(-1) losses, ious = self.loss_func.pairwise(fg_preds, targets, input_is_normalized=False) costs = losses.overlap + losses.confidence + losses.classification - costs += 100000.0 * ~inside_matrix[:, fg_mask].repeat_interleave(boxes_per_cell, 1) - matched_preds, matched_targets = _sim_ota_match(costs, ious) - - preds = { - "boxes": fg_preds["boxes"][matched_preds], - "confidences": fg_preds["confidences"][matched_preds], - "bg_confidences": torch.cat((bg_confidences, fg_preds["confidences"][torch.logical_not(matched_preds)])), - "classprobs": fg_preds["classprobs"][matched_preds], - } - targets = { - "boxes": targets["boxes"][matched_targets], - "labels": targets["labels"][matched_targets], - } - return preds, targets + costs += 100000.0 * ~inside_matrix[:, mask].repeat_interleave(boxes_per_cell, 1) + pred_mask, target_selector = _sim_ota_match(costs, ious) + + # Add the anchor dimension to the mask and replace True values with the results of the actual SimOTA matching. + mask = mask.view(height, width).unsqueeze(-1).repeat(1, 1, boxes_per_cell) + mask[mask.nonzero().T.tolist()] = pred_mask + + background_mask = torch.logical_not(mask) + + return mask, background_mask, target_selector diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 2a65ac98d8..ea81fee2cb 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -1,15 +1,70 @@ from collections import OrderedDict -from typing import Any, List, Optional, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import torch import torch.nn as nn from torch import Tensor -from pl_bolts.models.detection.yolo.layers import Conv, DetectionLayer, MaxPool, create_detection_layer +from pl_bolts.models.detection.yolo.layers import Conv, DetectionLayer, MaxPool, ReOrg, create_detection_layer from pl_bolts.models.detection.yolo.types import NETWORK_OUTPUT, TARGETS from pl_bolts.models.detection.yolo.utils import get_image_size +def run_detection( + detection_layer: DetectionLayer, + input: Tensor, + targets: List[Dict[str, Tensor]], + image_size: Tensor, + detections: List[Tensor], + losses: List[Tensor], + hits: List[int], +): + """Runs the detection layer on the inputs and appends the output to the ``detections`` list. + + If ``targets`` is given, also calculates the losses and appends to the ``losses`` list. 
+ """ + output, preds = detection_layer(input, image_size) + detections.append(output) + + if targets is not None: + layer_losses, layer_hits = detection_layer.calculate_losses(preds, targets, image_size) + losses.append(layer_losses) + hits.append(layer_hits) + + +def run_detection_with_aux_head( + detection_layer: DetectionLayer, + aux_detection_layer: DetectionLayer, + input: Tensor, + aux_input: Tensor, + targets: List[Dict[str, Tensor]], + image_size: Tensor, + aux_weight: float, + detections: List[Tensor], + losses: List[Tensor], + hits: List[int], +): + """Runs the detection layer on the inputs and appends the output to the ``detections`` list. + + If ``targets`` is given, also runs the auxiliary detection layer on the auxiliary inputs, calculates the losses, and + appends the losses to the ``losses`` list. + """ + output, preds = detection_layer(input, image_size) + detections.append(output) + + if targets is not None: + # Match lead head predictions to targets and calculate losses from lead head outputs. + layer_losses, layer_hits = detection_layer.calculate_losses(preds, targets, image_size) + losses.append(layer_losses) + hits.append(layer_hits) + + # Match lead head predictions to targets and calculate losses from auxiliary head outputs. + _, aux_preds = aux_detection_layer(aux_input, image_size) + layer_losses, layer_hits = aux_detection_layer.calculate_losses(preds, targets, image_size, loss_preds=aux_preds) + losses.append(layer_losses * aux_weight) + hits.append(layer_hits) + + class BottleneckBlock(nn.Module): """A residual block with a bottleneck layer. @@ -126,13 +181,136 @@ def forward(self, x: Tensor) -> Tensor: return self.mix(torch.cat((y1, y2), dim=1)) +class ELANStage(nn.Module): + """One stage of an Efficient Layer Aggregation Network (ELAN). + + `Chien-Yao Wang et al. `_ + + Args: + in_channels: Number of input channels that the ELAN stage expects. + out_channels: Number of output channels that the ELAN stage produces. + hidden_channels: Number of output channels that the computational blocks produce. The default value is half the + number of output channels of the block, as in YOLOv7-W6, but the value varies between the variants. + split_channels: Number of channels in each part after splitting the input to the cross stage connection and the + computational blocks. The default value is the number of hidden channels, as in all YOLOv7 backbones. Most + YOLOv7 heads use twice the number of hidden channels. + depth: Number of computational blocks that the ELAN stage contains. The default value is 2. YOLOv7 backbones use + 2 to 4 blocks per stage. + block_depth: Number of convolutional layers in one computational block. The default value is 2. YOLOv7 backbones + have two convolutions per block. YOLOv7 heads (except YOLOv7-X) have 2 to 8 blocks with only one convolution + in each. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + hidden_channels: Optional[int] = None, + split_channels: Optional[int] = None, + depth: int = 2, + block_depth: int = 2, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + + def block(in_channels: int, out_channels: int) -> nn.Module: + convs = [conv3x3(in_channels, out_channels)] + for _ in range(block_depth - 1): + convs.append(conv3x3(out_channels, out_channels)) + return nn.Sequential(*convs) + + # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two + # convolutions with N/2 output channels. However, in many YOLOv7 architectures, the number of hidden channels is + # not exactly half the number of output channels. + if hidden_channels is None: + hidden_channels = out_channels // 2 + + if split_channels is None: + split_channels = hidden_channels + + self.split1 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split2 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + blocks = [block(split_channels, hidden_channels)] + for _ in range(depth - 1): + blocks.append(block(hidden_channels, hidden_channels)) + self.blocks = nn.ModuleList(blocks) + + total_channels = (split_channels * 2) + (hidden_channels * depth) + self.mix = Conv(total_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x: Tensor) -> Tensor: + outputs = [self.split1(x), self.split2(x)] + x = outputs[-1] + for block in self.blocks: + x = block(x) + outputs.append(x) + return self.mix(torch.cat(outputs, dim=1)) + + +class CSPSPP(nn.Module): + """Spatial pyramid pooling module from the Cross Stage Partial Network from YOLOv4. + + Args: + in_channels: Number of input channels that the module expects. + out_channels: Number of output channels that the module produces. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=kernel_size, stride=1, activation=activation, norm=norm) + + self.conv1 = nn.Sequential( + conv(in_channels, out_channels), + conv(out_channels, out_channels, kernel_size=3), + conv(out_channels, out_channels), + ) + self.conv2 = conv(in_channels, out_channels) + + self.maxpool1 = MaxPool(kernel_size=5, stride=1) + self.maxpool2 = MaxPool(kernel_size=9, stride=1) + self.maxpool3 = MaxPool(kernel_size=13, stride=1) + + self.mix1 = nn.Sequential( + conv(4 * out_channels, out_channels), + conv(out_channels, out_channels, kernel_size=3), + ) + self.mix2 = Conv(2 * out_channels, out_channels) + + def forward(self, x): + x1 = self.conv1(x) + x2 = self.maxpool1(x1) + x3 = self.maxpool2(x1) + x4 = self.maxpool3(x1) + y1 = self.mix1(torch.cat((x1, x2, x3, x4), dim=1)) + y2 = self.conv2(x) + return self.mix2(torch.cat((y1, y2), dim=1)) + + class FastSPP(nn.Module): - """Fast spatial pyramid pooling module. + """Fast spatial pyramid pooling module from YOLOv5. Args: in_channels: Number of input channels that the module expects. out_channels: Number of output channels that the module produces. - kernel_size: Kernel size for convolutional layers. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". @@ -142,14 +320,13 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: int = 5, activation: Optional[str] = "silu", norm: Optional[str] = "batchnorm", ): super().__init__() hidden_channels = in_channels // 2 self.conv = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - self.maxpool = MaxPool(kernel_size=kernel_size, stride=1) + self.maxpool = MaxPool(kernel_size=5, stride=1) self.mix = Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) def forward(self, x: Tensor) -> Tensor: @@ -353,6 +530,69 @@ def forward(self, x: Tensor) -> List[Tensor]: return [c1, c2, c3, c4, c5] +class YOLOV7Backbone(nn.Module): + """A backbone that corresponds to the W6 variant of the Efficient Layer Aggregation Network from YOLOv7. + + Args: + in_channels: Number of channels in the input image. + widths: Number of channels at each network stage. Before the first stage there will be one extra split of + spatial resolution by a ``ReOrg`` layer, producing ``in_channels * 4`` channels. + depth: Number of computational blocks at each network stage. YOLOv7-W6 backbone uses 2. + block_depth: Number of convolutional layers in one computational block. YOLOv7-W6 backbone uses 2. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + in_channels: int = 3, + widths: Sequence[int] = (64, 128, 256, 512, 768, 1024), + depth: int = 2, + block_depth: int = 2, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def stage(in_channels: int, out_channels: int) -> nn.Module: + elan = ELANStage( + out_channels, + out_channels, + depth=depth, + block_depth=block_depth, + activation=activation, + norm=normalization, + ) + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("elan", elan), + ] + ) + ) + + self.stem = nn.Sequential(*[ReOrg(), conv3x3(in_channels * 4, widths[0])]) + self.stages = nn.ModuleList( + stage(in_channels, out_channels) for in_channels, out_channels in zip(widths[:-1], widths[1:]) + ) + + def forward(self, x: Tensor) -> List[Tensor]: + x = self.stem(x) + outputs: List[Tensor] = [] + for stage in self.stages: + x = stage(x) + outputs.append(x) + return outputs + + class YOLOV4TinyNetwork(nn.Module): """The "tiny" network architecture from YOLOv4. @@ -478,24 +718,9 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.upsample4(p4), c3), dim=1) p3 = self.fpn3(x) - y = self.detect5(self.out5(p5), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect5.losses) - hits.append(self.detect5.hits) - - y = self.detect4(self.out4(p4), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect4.losses) - hits.append(self.detect4.hits) - - y = self.detect3(self.out3(p3), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect3.losses) - hits.append(self.detect3.hits) - + run_detection(self.detect5, self.out5(p5), targets, image_size, detections, losses, hits) + run_detection(self.detect4, self.out4(p4), targets, image_size, detections, losses, hits) + run_detection(self.detect3, self.out3(p3), targets, image_size, detections, losses, hits) return detections, losses, hits @@ -568,7 +793,7 @@ def __init__( num_outputs = (5 + num_classes) * anchors_per_cell def spp(in_channels: int, out_channels: int) -> nn.Module: - return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) @@ -654,24 +879,9 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample4(n4), c5), dim=1) n5 = self.pan5(x) - y = self.detect3(self.out3(n3), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect3.losses) - hits.append(self.detect3.hits) - - y = self.detect4(self.out4(n4), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect4.losses) - hits.append(self.detect4.hits) - - y = self.detect5(self.out5(n5), image_size, targets) - detections.append(y) - if 
targets is not None: - losses.append(self.detect5.losses) - hits.append(self.detect5.hits) - + run_detection(self.detect3, self.out3(n3), targets, image_size, detections, losses, hits) + run_detection(self.detect4, self.out4(n4), targets, image_size, detections, losses, hits) + run_detection(self.detect5, self.out5(n5), targets, image_size, detections, losses, hits) return detections, losses, hits @@ -751,7 +961,7 @@ def __init__( num_outputs = (5 + num_classes) * anchors_per_cell def spp(in_channels: int, out_channels: int) -> nn.Module: - return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) def conv(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) @@ -853,30 +1063,10 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample5(n5), c6), dim=1) n6 = self.pan6(x) - y = self.detect3(self.out3(n3), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect3.losses) - hits.append(self.detect3.hits) - - y = self.detect4(self.out4(n4), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect4.losses) - hits.append(self.detect4.hits) - - y = self.detect5(self.out5(n5), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect5.losses) - hits.append(self.detect5.hits) - - y = self.detect6(self.out6(n6), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect6.losses) - hits.append(self.detect6.hits) - + run_detection(self.detect3, self.out3(n3), targets, image_size, detections, losses, hits) + run_detection(self.detect4, self.out4(n4), targets, image_size, detections, losses, hits) + run_detection(self.detect5, self.out5(n5), targets, image_size, detections, losses, hits) + run_detection(self.detect6, self.out6(n6), targets, image_size, detections, losses, hits) return detections, losses, hits @@ -955,7 +1145,7 @@ def __init__( num_outputs = (5 + num_classes) * anchors_per_cell def spp(in_channels: int, out_channels: int) -> nn.Module: - return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) @@ -1037,24 +1227,258 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample4(n4), p5), dim=1) n5 = self.pan5(x) - y = self.detect3(self.out3(n3), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect3.losses) - hits.append(self.detect3.hits) + run_detection(self.detect3, self.out3(n3), targets, image_size, detections, losses, hits) + run_detection(self.detect4, self.out4(n4), targets, image_size, detections, losses, hits) + run_detection(self.detect5, self.out5(n5), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOV7Network(nn.Module): + """Network architecture that corresponds to the W6 variant of YOLOv7 with four detection layers. + + Args: + num_classes: Number of different classes that this model predicts. 
+ backbone: A backbone network that returns the output from each stage. + widths: Number of channels at each network stage. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain (width, height) tuples in the network input + resolution. There should be `3N` tuples, where `N` defines the number of anchors per spatial location. They + are assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning + that you typically want to sort the shapes from the smallest to the largest. + aux_weight: Weight for the loss from the auxiliary heads. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou" (default). + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024, 1024), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[List[Tuple[int, int]]] = None, + aux_weight: float = 0.25, + **kwargs: Any, + ) -> None: + super().__init__() + + self.aux_weight = aux_weight + + # By default use the prior shapes that have been learned from the COCO data. 
+ if prior_shapes is None: + prior_shapes = [ + (13, 17), + (31, 25), + (24, 51), + (61, 45), + (61, 45), + (48, 102), + (119, 96), + (97, 189), + (97, 189), + (217, 184), + (171, 384), + (324, 451), + (324, 451), + (545, 357), + (616, 618), + (1024, 1024), + ] + anchors_per_cell = 4 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 4) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 4.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def elan(in_channels: int, out_channels: int) -> nn.Module: + return ELANStage( + in_channels, + out_channels, + split_channels=out_channels, + depth=4, + block_depth=1, + norm=normalization, + activation=activation, + ) + + def out(in_channels: int, hidden_channels: int) -> nn.Module: + conv = Conv( + in_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=normalization + ) + outputs = nn.Conv2d(hidden_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def detect(prior_shape_idxs: Sequence[int], range: float) -> DetectionLayer: + assert prior_shapes is not None + return create_detection_layer( + prior_shapes, + prior_shape_idxs, + sim_ota_range=range, + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV7Backbone( + widths=widths, depth=2, block_depth=2, activation=activation, normalization=normalization + ) + + w3 = widths[-4] # 256 + w4 = widths[-3] # 512 + w5 = widths[-2] # 768 + w6 = widths[-1] # 1024 + + self.spp = spp(w6, w6 // 2) + + self.pre5 = conv(w5, w5 // 2) + self.upsample6 = upsample(w6 // 2, w5 // 2) + self.fpn5 = elan(w5, w5 // 2) - y = self.detect4(self.out4(n4), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect4.losses) - hits.append(self.detect4.hits) + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5 // 2, w4 // 2) + self.fpn4 = elan(w4, w4 // 2) - y = self.detect5(self.out5(n5), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect5.losses) - hits.append(self.detect5.hits) + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4 // 2, w3 // 2) + self.fpn3 = elan(w3, w3 // 2) + + self.downsample3 = downsample(w3 // 2, w4 // 2) + self.pan4 = elan(w4, w4 // 2) + + self.downsample4 = downsample(w4 // 2, w5 // 2) + self.pan5 = elan(w5, w5 // 2) + + self.downsample5 = downsample(w5 // 2, w6 // 2) + self.pan6 = elan(w6, w6 // 2) + + self.out3 = out(w3 // 2, w3) + self.aux_out3 = out(w3 // 2, w3 + (w3 // 4)) + self.out4 = out(w4 // 2, w4) + self.aux_out4 = out(w4 // 2, w4 + (w4 // 4)) + self.out5 = out(w5 // 2, 
w5) + self.aux_out5 = out(w5 // 2, w5 + (w5 // 4)) + self.out6 = out(w6 // 2, w6) + self.aux_out6 = out(w6 // 2, w6 + (w6 // 4)) + + self.detect3 = detect(range(0, anchors_per_cell), 5.0) + self.aux_detect3 = detect(range(0, anchors_per_cell), 3.0) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2), 5.0) + self.aux_detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2), 3.0) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3), 5.0) + self.aux_detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3), 3.0) + self.detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4), 5.0) + self.aux_detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4), 3.0) + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections = [] # Outputs from detection layers + losses = [] # Losses from detection layers + hits = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5, x = self.backbone(x)[-4:] + c6 = self.spp(x) + + x = torch.cat((self.upsample6(c6), self.pre5(c5)), dim=1) + p5 = self.fpn5(x) + x = torch.cat((self.upsample5(p5), self.pre4(c4)), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) + n3 = self.fpn3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + x = torch.cat((self.downsample5(n5), c6), dim=1) + n6 = self.pan6(x) + + run_detection_with_aux_head( + self.detect3, + self.aux_detect3, + self.out3(n3), + self.aux_out3(n3), + targets, + image_size, + self.aux_weight, + detections, + losses, + hits, + ) + run_detection_with_aux_head( + self.detect4, + self.aux_detect4, + self.out4(n4), + self.aux_out4(p4), + targets, + image_size, + self.aux_weight, + detections, + losses, + hits, + ) + run_detection_with_aux_head( + self.detect5, + self.aux_detect5, + self.out5(n5), + self.aux_out5(p5), + targets, + image_size, + self.aux_weight, + detections, + losses, + hits, + ) + run_detection_with_aux_head( + self.detect6, + self.aux_detect6, + self.out6(n6), + self.aux_out6(c6), + targets, + image_size, + self.aux_weight, + detections, + losses, + hits, + ) return detections, losses, hits @@ -1179,7 +1603,7 @@ def __init__( raise ValueError("The number of provided prior shapes needs to be divisible by 3.") def spp(in_channels: int, out_channels: int) -> nn.Module: - return FastSPP(in_channels, out_channels, kernel_size=5, activation=activation, norm=normalization) + return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) def downsample(in_channels: int, out_channels: int) -> nn.Module: return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) @@ -1267,22 +1691,7 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample4(n4), p5), dim=1) n5 = self.pan5(x) - y = self.detect3(self.out3(n3), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect3.losses) - hits.append(self.detect3.hits) - - y = self.detect4(self.out4(n4), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect4.losses) - hits.append(self.detect4.hits) - - y = self.detect5(self.out5(n5), image_size, targets) - detections.append(y) - if targets is not None: - losses.append(self.detect5.losses) - hits.append(self.detect5.hits) - + 
run_detection(self.detect3, self.out3(n3), targets, image_size, detections, losses, hits) + run_detection(self.detect4, self.out4(n4), targets, image_size, detections, losses, hits) + run_detection(self.detect5, self.out5(n5), targets, image_size, detections, losses, hits) return detections, losses, hits diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 543e48e7bb..350de959e3 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -77,12 +77,14 @@ def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS class YOLO(LightningModule): """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, - YOLOv5, Scaled-YOLOv4, and YOLOX. + YOLOv5, YOLOv7, Scaled-YOLOv4, and YOLOX. *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `_ *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `_ + *YOLOv7 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `_ + *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `_ From 3e3bad5e36cf891654ed434be2fce1d7f9836980 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 24 Feb 2023 18:33:28 +0200 Subject: [PATCH 58/76] Fixed a too long line --- pl_bolts/models/detection/yolo/torch_networks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index ea81fee2cb..3a659c8fae 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -60,7 +60,9 @@ def run_detection_with_aux_head( # Match lead head predictions to targets and calculate losses from auxiliary head outputs. _, aux_preds = aux_detection_layer(aux_input, image_size) - layer_losses, layer_hits = aux_detection_layer.calculate_losses(preds, targets, image_size, loss_preds=aux_preds) + layer_losses, layer_hits = aux_detection_layer.calculate_losses( + preds, targets, image_size, loss_preds=aux_preds + ) losses.append(layer_losses * aux_weight) hits.append(layer_hits) From d9d64eae1cb9f03d86adba2571ad60c3f53f318a Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 24 Feb 2023 20:37:04 +0200 Subject: [PATCH 59/76] Avoid using "input" as a variable name --- .../models/detection/yolo/torch_networks.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 3a659c8fae..ab8e7e354b 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -12,18 +12,27 @@ def run_detection( detection_layer: DetectionLayer, - input: Tensor, + layer_input: Tensor, targets: List[Dict[str, Tensor]], image_size: Tensor, detections: List[Tensor], losses: List[Tensor], hits: List[int], -): +) -> None: """Runs the detection layer on the inputs and appends the output to the ``detections`` list. If ``targets`` is given, also calculates the losses and appends to the ``losses`` list. + + Args: + detection_layer: The detection layer. + layer_input: Input to the detection layer. + targets: List of training targets for each image. + image_size: Width and height in a vector that defines the scale of the target coordinates. + detections: A list where a tensor containing the detections will be appended to. 
+ losses: A list where a tensor containing the losses will be appended to, if ``targets`` is given. + hits: A list where the number of targets that matched this layer will be appended to, if ``targets`` is given. """ - output, preds = detection_layer(input, image_size) + output, preds = detection_layer(layer_input, image_size) detections.append(output) if targets is not None: @@ -35,7 +44,7 @@ def run_detection( def run_detection_with_aux_head( detection_layer: DetectionLayer, aux_detection_layer: DetectionLayer, - input: Tensor, + layer_input: Tensor, aux_input: Tensor, targets: List[Dict[str, Tensor]], image_size: Tensor, @@ -43,13 +52,25 @@ def run_detection_with_aux_head( detections: List[Tensor], losses: List[Tensor], hits: List[int], -): +) -> None: """Runs the detection layer on the inputs and appends the output to the ``detections`` list. If ``targets`` is given, also runs the auxiliary detection layer on the auxiliary inputs, calculates the losses, and appends the losses to the ``losses`` list. + + Args: + detection_layer: The lead detection layer. + aux_detection_layer: The auxiliary detection layer. + layer_input: Input to the lead detection layer. + aux_input: Input to the auxiliary detection layer. + targets: List of training targets for each image. + image_size: Width and height in a vector that defines the scale of the target coordinates. + aux_weight: Weight of the auxiliary loss. + detections: A list where a tensor containing the detections will be appended to. + losses: A list where a tensor containing the losses will be appended to, if ``targets`` is given. + hits: A list where the number of targets that matched this layer will be appended to, if ``targets`` is given. """ - output, preds = detection_layer(input, image_size) + output, preds = detection_layer(layer_input, image_size) detections.append(output) if targets is not None: From 49d970994dfffaedef65ced9d39cf525660e953d Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 24 Feb 2023 22:36:09 +0200 Subject: [PATCH 60/76] Fixed type annotations --- pl_bolts/models/detection/yolo/layers.py | 2 +- .../models/detection/yolo/target_matching.py | 6 +-- .../models/detection/yolo/torch_networks.py | 42 +++++++++---------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 9abce36e22..9f9a4edb6e 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -371,7 +371,7 @@ class ReOrg(nn.Module): The result is a tensor with half the width and height, and four times as many channels. """ - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: tl = x[..., ::2, ::2] bl = x[..., 1::2, ::2] tr = x[..., ::2, 1::2] diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index d9b584eaa4..0d6642521a 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Sequence, Tuple, Union +from typing import Dict, List, Sequence, Tuple, Union import torch from torch import Tensor @@ -39,7 +39,7 @@ def __call__( preds: Dict[str, Tensor], targets: Dict[str, Tensor], image_size: Tensor, - ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + ) -> Tuple[List[Tensor], Tensor, Tensor]: """For each target, selects predictions from the same grid cell, where the center of the target box is. 
Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the @@ -264,7 +264,7 @@ def __call__( preds: Dict[str, Tensor], targets: Dict[str, Tensor], image_size: Tensor, - ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + ) -> Tuple[Tensor, Tensor, Tensor]: """For each target, selects predictions using the SimOTA matching rule. Args: diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index ab8e7e354b..e6fe3f2296 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -13,7 +13,7 @@ def run_detection( detection_layer: DetectionLayer, layer_input: Tensor, - targets: List[Dict[str, Tensor]], + targets: Optional[List[Dict[str, Tensor]]], image_size: Tensor, detections: List[Tensor], losses: List[Tensor], @@ -46,7 +46,7 @@ def run_detection_with_aux_head( aux_detection_layer: DetectionLayer, layer_input: Tensor, aux_input: Tensor, - targets: List[Dict[str, Tensor]], + targets: Optional[List[Dict[str, Tensor]]], image_size: Tensor, aux_weight: float, detections: List[Tensor], @@ -318,7 +318,7 @@ def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module ) self.mix2 = Conv(2 * out_channels, out_channels) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x1 = self.conv1(x) x2 = self.maxpool1(x1) x3 = self.maxpool2(x1) @@ -727,9 +727,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.detect5 = detect([6, 7, 8]) def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -884,9 +884,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -1064,9 +1064,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4)) def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -1230,9 +1230,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) def forward(self, x: Tensor, targets: Optional[TARGETS] = 
None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -1432,9 +1432,9 @@ def detect(prior_shape_idxs: Sequence[int], range: float) -> DetectionLayer: self.aux_detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4), 3.0) def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) @@ -1694,9 +1694,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionLayer: self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: - detections = [] # Outputs from detection layers - losses = [] # Losses from detection layers - hits = [] # Number of targets each detection layer was responsible for + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for image_size = get_image_size(x) From 8e4afc97e97ab9a9fbd4eee181466974dd642004 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 16 Mar 2023 14:34:05 +0200 Subject: [PATCH 61/76] SimOTA uses also size ratio for "center prior" filtering --- pl_bolts/models/detection/yolo/README.md | 7 +- .../models/detection/yolo/darknet_network.py | 68 ++++--- pl_bolts/models/detection/yolo/layers.py | 9 +- pl_bolts/models/detection/yolo/loss.py | 32 +-- .../models/detection/yolo/target_matching.py | 191 ++++++++++++------ .../models/detection/yolo/torch_networks.py | 32 ++- pl_bolts/models/detection/yolo/utils.py | 30 ++- pl_bolts/models/detection/yolo/yolo_module.py | 52 +++-- .../models/yolo/unit/test_darknet_network.py | 4 - .../models/yolo/unit/test_target_matching.py | 21 +- tests/models/yolo/unit/test_utils.py | 10 +- 11 files changed, 292 insertions(+), 164 deletions(-) diff --git a/pl_bolts/models/detection/yolo/README.md b/pl_bolts/models/detection/yolo/README.md index 67df445502..8f67febf0b 100644 --- a/pl_bolts/models/detection/yolo/README.md +++ b/pl_bolts/models/detection/yolo/README.md @@ -6,6 +6,7 @@ This PyTorch Lightning implementation combines features from some of the notable - *YOLOv3*: [Joseph Redmon and Ali Farhadi](https://arxiv.org/abs/1804.02767) - *YOLOv4*: [Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2004.10934) +- *YOLOv7*: [Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2207.02696) - *Scaled-YOLOv4*: [Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao](https://arxiv.org/abs/2011.08036) - *YOLOX*: [Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun](https://arxiv.org/abs/2107.08430) @@ -21,14 +22,14 @@ There are several network architectures included in the [`torch_networks`](https A detection head can 
try to detect objects at each of the anchor points that are spaced evenly across the image in a grid. The size of the grid is determined by the width and height of the feature map. There can be a number of anchors (typically three) per grid cell. The number of features predicted per grid cell has to be `(5 + num_classes) * anchors_per_cell`.

-The width and the height of a bounding box is detected relative to a prior shape. `anchors_per_cell` prior shapes per detection head are defined in the network configuration. That is, if the network uses three detection heads, and each head detects three bounding boxes per grid cell, nine prior shapes need to be defined. They are defined in the Darknet configuration file or provided to the network class constructor. The defaults values have been obtained by clustering bounding box shapes in the COCO dataset. Note that if you use a different image size, you probably want to scale the prior shapes too.
+The width and the height of a bounding box are detected relative to a prior shape. `anchors_per_cell` prior shapes per detection head are defined in the network configuration. That is, if the network uses three detection heads, and each head detects three bounding boxes per grid cell, nine prior shapes need to be defined. They are defined in the Darknet configuration file or provided to the network class constructor. The default values have been obtained by clustering bounding box shapes in the COCO dataset. Note that if you use a different image size, you probably want to scale the prior shapes too.

-With the exception of the SimOTA matching algorithm, the prior shapes are also used for matching the ground-truth targets to anchors during training. In this case targets are matched only to anchors from the closest grid cell. The prior shapes are used to determine, to which anchors from that cell the target is matched. The losses are computed between the target boxes and the predictions that correspond to their matched anchors. Different matching rules have been implemented:
+The prior shapes are also used for matching the ground-truth targets to anchors during training. With the exception of the SimOTA matching algorithm, targets are matched only to anchors from the closest grid cell. The prior shapes are used to determine to which anchors from that cell the target is matched. The losses are computed between the target boxes and the predictions that correspond to their matched anchors. Different matching rules have been implemented:

 - *maxiou*: The original matching rule that matches a target to the prior shape that gives the highest IoU.
 - *iou*: Matches a target to an anchor, if the IoU between the target and the prior shape is above a threshold. Multiple anchors may be matched to the same target, and the loss will be computed from a number of pairs that is generally not the same as the number of ground-truth boxes.
 - *size*: Calculates the ratio between the width and height of the target box to the prior width and height. If both the width and the height are close enough to the prior shape, matches the target to the anchor.
-- *simota*: The SimOTA matching algorithm from YOLOX. Targets can be matched not only to anchors from the closest grid cell, but to any anchors that are inside the target bounding box. The matching algorithm is based on Optimal Transport and uses the training loss between the target and the predictions as the cost. That is, the prior shapes are not used for matching, but the predictions corresponding to the anchors.
+- *simota*: The SimOTA matching algorithm from YOLOX. Targets can be matched not only to anchors from the closest grid cell, but to any anchors that are inside the target bounding box and whose prior shape is close enough to the target shape. The matching algorithm is based on Optimal Transport and uses the training loss between the target and the predictions as the cost. That is, the prior shapes are not used for matching, but the predictions corresponding to the anchors. ## Input Data diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index aff197b36c..7fa775775a 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -26,39 +26,41 @@ class DarknetNetwork(nn.Module): - """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation.""" + """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. + + Iterates through the layers from the configuration and creates corresponding PyTorch modules. If ``weights_path`` is + given and points to a Darknet model file, loads the convolutional layer weights from the file. + + Args: + config_path: Path to a Darknet configuration file that defines the network architecture. + weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. + in_channels: Number of channels in the input image. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a + function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and + "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target + confidence is one if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + """ def __init__( self, config_path: str, weights_path: Optional[str] = None, in_channels: Optional[int] = None, **kwargs: Any ) -> None: - """Parses a Darknet configuration file and creates the network structure. 
- - Iterates through the layers from the configuration and creates corresponding PyTorch modules. If - ``weights_path`` is given and points to a Darknet model file, loads the convolutional layer weights from the - file. - - Args: - config_path: Path to a Darknet configuration file that defines the network architecture. - weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. - in_channels: Number of channels in the input image. - matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching - rule from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is - below given ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the - prior shape that gives the highest IoU, default). - matching_threshold: Threshold for "size" and "iou" matching algorithms. - ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding - anchor has IoU with some target greater than this threshold, the predictor will not be taken into - account when calculating the confidence loss. - overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or - a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", - and "ciou". - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of - ``overlap_func``. - overlap_loss_multiplier: Overlap loss will be scaled by this value. - confidence_loss_multiplier: Confidence loss will be scaled by this value. - class_loss_multiplier: Classification loss will be scaled by this value. - """ super().__init__() with open(config_path) as config_file: @@ -405,6 +407,8 @@ def _create_yolo( prior_shapes: Optional[List[Tuple[int, int]]] = None, matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, + spatial_range: float = 5.0, + size_range: float = 4.0, ignore_bg_threshold: Optional[float] = None, overlap_func: Optional[Union[str, Callable]] = None, predict_overlap: float = 1.0, @@ -428,6 +432,10 @@ def _create_yolo( ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. 
@@ -471,6 +479,8 @@ def _create_yolo( prior_shape_idxs=config["mask"], matching_algorithm=matching_algorithm, matching_threshold=matching_threshold, + spatial_range=spatial_range, + size_range=size_range, ignore_bg_threshold=ignore_bg_threshold, overlap_func=overlap_func, predict_overlap=predict_overlap, diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 9f9a4edb6e..8b87fe0229 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -424,7 +424,8 @@ def create_detection_layer( prior_shape_idxs: Sequence[int], matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, - sim_ota_range: float = 5.0, + spatial_range: float = 5.0, + size_range: float = 4.0, ignore_bg_threshold: float = 0.7, overlap_func: Union[str, Callable] = "ciou", predict_overlap: float = 1.0, @@ -446,8 +447,10 @@ def create_detection_layer( ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. - sim_ota_range: The "simota" matching algorithm will restrict to the anchors that are within an `N x N` grid cell + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -473,7 +476,7 @@ def create_detection_layer( loss_func = LossFunction( overlap_func, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier ) - matching_func = SimOTAMatching(loss_func, sim_ota_range) + matching_func = SimOTAMatching(prior_shapes, prior_shape_idxs, loss_func, spatial_range, size_range) elif matching_algorithm == "size": if matching_threshold is None: raise ValueError("matching_threshold is required with size ratio matching.") diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index fea1a916ea..ba53653994 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -116,27 +116,27 @@ def _pairwise_confidence_loss( Args: preds: An ``[N]`` vector of predicted confidences. - overlap: An ``[M, N]`` matrix of overlaps between all target and predicted bounding boxes. + overlap: An ``[N, M]`` matrix of overlaps between all predicted and target bounding boxes. bce_func: A function for calculating binary cross entropy. predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target confidence is one if there's an object, and 1.0 means that the target confidence is the overlap. Returns: - An ``[M, N]`` matrix of confidence losses between all targets and predictions. + An ``[N, M]`` matrix of confidence losses between all predictions and targets. """ if predict_overlap is not None: # When predicting overlap, target confidence is different for each pair of a prediction and a target. The - # tensors have to be broadcasted to [M, N]. 
- preds = preds.unsqueeze(0).expand(overlap.shape) + # tensors have to be broadcasted to [N, M]. + preds = preds.unsqueeze(1).expand(overlap.shape) targets = torch.ones_like(preds) - predict_overlap # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. targets += predict_overlap * overlap.detach().clamp(min=0) return bce_func(preds, targets, reduction="none") else: - # When not predicting overlap, target confidence is the same for every target, but we should still return a + # When not predicting overlap, target confidence is the same for every prediction, but we should still return a # matrix. targets = torch.ones_like(preds) - return bce_func(preds, targets, reduction="none").unsqueeze(0).expand(overlap.shape) + return bce_func(preds, targets, reduction="none").unsqueeze(1).expand(overlap.shape) def _foreground_confidence_loss( @@ -257,28 +257,36 @@ def pairwise( This method is called for obtaining costs for SimOTA matching. Args: - preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". - targets: A dictionary of training targets, containing "boxes" and "labels". + preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". Each tensor + contains `N` rows. + targets: A dictionary of training targets, containing "boxes" and "labels". Each tensor contains `M` rows. input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. Returns: - Loss matrices and an overlap matrix. + Loss matrices and an overlap matrix. Each matrix is shaped ``[N, M]``. """ + loss_shape = torch.Size([len(preds["boxes"]), len(targets["boxes"])]) + if input_is_normalized: bce_func = binary_cross_entropy else: bce_func = binary_cross_entropy_with_logits - overlap = self._pairwise_overlap(targets["boxes"], preds["boxes"]) + overlap = self._pairwise_overlap(preds["boxes"], targets["boxes"]) + assert overlap.shape == loss_shape + overlap_loss = 1.0 - overlap + assert overlap_loss.shape == loss_shape confidence_loss = _pairwise_confidence_loss(preds["confidences"], overlap, bce_func, self.predict_overlap) + assert confidence_loss.shape == loss_shape - pred_probs = preds["classprobs"].unsqueeze(0) # [1, preds, classes] + pred_probs = preds["classprobs"].unsqueeze(1) # [N, 1, classes] target_probs = _target_labels_to_probs(targets["labels"], pred_probs.shape[-1], pred_probs.dtype) - target_probs = target_probs.unsqueeze(1) # [targets, 1, classes] + target_probs = target_probs.unsqueeze(0) # [1, M, classes] pred_probs, target_probs = torch.broadcast_tensors(pred_probs, target_probs) class_loss = bce_func(pred_probs, target_probs, reduction="none").sum(-1) + assert class_loss.shape == loss_shape losses = Losses( overlap_loss * self.overlap_multiplier, diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index 0d6642521a..b566d6cdee 100644 --- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -15,6 +15,28 @@ warn_missing_pkg("torchvision") +def _compare_box_sizes(wh1: Tensor, wh2: Tensor, threshold: float) -> Tensor: + """Compares the dimensions of the boxes pairwise and returns a mask that indicates which pairs have similar + sizes. + + For each pair of boxes, calculates the largest ratio that can be obtained by dividing the widths with each other or + dividing the heights with each other. 
Returns a mask that indicates which pairs have a ratio less than the given
+    threshold.
+
+    Args:
+        wh1: An ``[N, 2]`` matrix of box shapes (width and height).
+        wh2: An ``[M, 2]`` matrix of box shapes (width and height).
+        threshold: A threshold for the size ratio.
+
+    Returns:
+        An ``[N, M]`` matrix of truth values indicating which box pairs have the maximum size ratio below the threshold.
+    """
+    wh_ratio = wh1[:, None, :] / wh2[None, :, :]  # [N, M, 2]
+    wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio)
+    wh_ratio = wh_ratio.max(2).values  # [N, M]
+    return wh_ratio < threshold
+
+
 class ShapeMatching(ABC):
     """Selects which anchors are used to predict each target, by comparing the shape of the target box to a set of
     prior shapes.
@@ -196,12 +218,7 @@ def __init__(
 
     def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]:
         prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device)
-
-        wh_ratio = wh[:, None, :] / prior_wh[None, :, :]  # [num_targets, num_anchors, 2]
-        wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio)
-        wh_ratio = wh_ratio.max(2).values  # [num_targets, num_anchors]
-        below_threshold = (wh_ratio < self.threshold).nonzero()
-        return below_threshold.T
+        return _compare_box_sizes(wh, prior_wh, self.threshold).nonzero().T
 
 
 def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]:
@@ -212,35 +229,38 @@ def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]:
     predicted boxes.
 
     Args:
-        costs: Sum of losses for (prediction, target) pairs: ``[targets, predictions]``
-        ious: IoUs for (prediction, target) pairs: ``[targets, predictions]``
+        costs: A ``[predictions, targets]`` matrix of losses.
+        ious: A ``[predictions, targets]`` matrix of IoUs.
 
     Returns:
         A mask of predictions that were matched, and the indices of the matched targets. The latter contains as many
         elements as there are ``True`` values in the mask.
     """
+    num_preds, num_targets = ious.shape
+
     matching_matrix = torch.zeros_like(costs, dtype=torch.bool)
     if ious.numel() > 0:
         # For each target, define k as the sum of the 10 highest IoUs.
-        top10_iou = torch.topk(ious, min(10, ious.shape[1])).values.sum(1)
+        top10_iou = torch.topk(ious, min(10, num_preds), dim=0).values.sum(0)
         ks = torch.clip(top10_iou.int(), min=1)
+        assert len(ks) == num_targets
 
-        # For each target, select k predictions with lowest cost.
-        for target_idx, (cost, k) in enumerate(zip(costs, ks)):
-            prediction_idx = torch.topk(cost, k, largest=False).indices
-            matching_matrix[target_idx, prediction_idx] = True
+        # For each target, select k predictions with the lowest cost.
+        for target_idx, (target_costs, k) in enumerate(zip(costs.T, ks)):
+            pred_idx = torch.topk(target_costs, k, largest=False).indices
+            matching_matrix[pred_idx, target_idx] = True
 
         # If there's more than one match for some prediction, match it with the best target. Now we consider all
         # targets, regardless of whether they were originally matched with the prediction or not.
-        more_than_one_match = matching_matrix.sum(0) > 1
-        best_targets = costs[:, more_than_one_match].argmin(0)
-        matching_matrix[:, more_than_one_match] = False
-        matching_matrix[best_targets, more_than_one_match] = True
+        more_than_one_match = matching_matrix.sum(1) > 1
+        best_targets = costs[more_than_one_match, :].argmin(1)
+        matching_matrix[more_than_one_match, :] = False
+        matching_matrix[more_than_one_match, best_targets] = True
 
     # For those predictions that were matched, get the index of the target.
-    pred_mask = matching_matrix.sum(0) > 0
-    target_selector = matching_matrix[:, pred_mask].int().argmax(0)
+    pred_mask = matching_matrix.sum(1) > 0
+    target_selector = matching_matrix[pred_mask, :].int().argmax(1)
     return pred_mask, target_selector
 
 
@@ -250,14 +270,29 @@ class SimOTAMatching:
     This is the matching rule used by YOLOX.
 
     Args:
+        prior_shapes: A list of all the prior box dimensions. The list should contain (width, height) tuples in the
+            network input resolution.
+        prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that
+            this layer uses.
         loss_func: A ``LossFunction`` object that can be used to calculate the pairwise costs.
-        range: For each target, restrict to the anchors that are within an `N x N` grid cell are centered at the target,
-            where `N` is the value of this parameter.
+        spatial_range: For each target, restrict to the anchors that are within an `N × N` grid cell area centered at
+            the target, where `N` is the value of this parameter.
+        size_range: For each target, restrict to the anchors whose prior dimensions are not larger than the target
+            dimensions multiplied by this value and not smaller than the target dimensions divided by this value.
     """
 
-    def __init__(self, loss_func: LossFunction, range: float = 5.0) -> None:
+    def __init__(
+        self,
+        prior_shapes: Sequence[Tuple[int, int]],
+        prior_shape_idxs: Sequence[int],
+        loss_func: LossFunction,
+        spatial_range: float,
+        size_range: float,
+    ) -> None:
+        self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs]
         self.loss_func = loss_func
-        self.range = range
+        self.spatial_range = spatial_range
+        self.size_range = size_range
 
     def __call__(
         self,
@@ -276,47 +311,89 @@ def __call__(
         A mask of predictions that were matched, background mask (inverse of the first mask), and the indices of the
         matched targets. The last tensor contains as many elements as there are ``True`` values in the first mask.
         """
-        height, width, boxes_per_cell, num_classes = preds["classprobs"].shape
-        device = preds["boxes"].device
+        height, width, boxes_per_cell, _ = preds["boxes"].shape
+        prior_mask, anchor_inside_target = self._get_prior_mask(targets, image_size, width, height, boxes_per_cell)
+        prior_preds = {
+            "boxes": preds["boxes"][prior_mask],
+            "confidences": preds["confidences"][prior_mask],
+            "classprobs": preds["classprobs"][prior_mask],
+        }
+
+        losses, ious = self.loss_func.pairwise(prior_preds, targets, input_is_normalized=False)
+        costs = losses.overlap + losses.confidence + losses.classification
+        costs += 100000.0 * ~anchor_inside_target
+        pred_mask, target_selector = _sim_ota_match(costs, ious)
+
+        # Add the anchor dimension to the mask and replace True values with the results of the actual SimOTA matching.
+        prior_mask[prior_mask.nonzero().T.tolist()] = pred_mask
+
+        background_mask = torch.logical_not(prior_mask)
+
+        return prior_mask, background_mask, target_selector
+
+    def _get_prior_mask(
+        self,
+        targets: Dict[str, Tensor],
+        image_size: Tensor,
+        grid_width: int,
+        grid_height: int,
+        boxes_per_cell: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """Creates a mask for selecting the "center prior" anchors.
+
+        In the first step we restrict ourselves to the grid cells whose center is inside or close enough to one or more
+        targets.
+
+        Args:
+            targets: Training targets for a single image.
+            image_size: Input image width and height.
+            grid_width: Width of the feature grid.
+            grid_height: Height of the feature grid.
+ boxes_per_cell: Number of boxes that will be predicted per feature grid cell. + + Returns: + Two masks, a ``[grid_height, grid_width, boxes_per_cell]`` mask for selecting anchors that are close and + similar in shape to a target, and an ``[anchors, targets]`` matrix that indicates which targets are inside + those anchors. + """ # A multiplier for scaling feature map coordinates to image coordinates - grid_size = torch.tensor([width, height], device=device) + grid_size = torch.tensor([grid_width, grid_height], device=targets["boxes"].device) grid_to_image = torch.true_divide(image_size, grid_size) - # Create a matrix for selecting the anchors that are inside the target bounding boxes. - centers = grid_centers(grid_size).view(-1, 2) * grid_to_image - inside_matrix = is_inside_box(centers, targets["boxes"]) - - # Set the width and height of all target bounding boxes to self.range grid cells and create a matrix for - # selecting the anchors that are now inside the boxes. If a small target has no anchors inside its bounding - # box, it will be matched to one of these anchors, but a high penalty will ensure that anchors that are inside - # the bounding box will be preferred. + # Get target center coordinates and dimensions. xywh = box_convert(targets["boxes"], in_fmt="xyxy", out_fmt="cxcywh") xy = xywh[:, :2] - wh = self.range * grid_to_image * torch.ones_like(xy) - xywh = torch.cat((xy, wh), -1) - boxes = box_convert(xywh, in_fmt="cxcywh", out_fmt="xyxy") - close_matrix = is_inside_box(centers, boxes) - - # In the first step we restrict ourselves to the grid cells whose center is inside or close enough to one or - # more targets. The prediction grids are flattened and masked using a [height * width] boolean vector. - mask = (inside_matrix | close_matrix).sum(0) > 0 - shape = (height * width, boxes_per_cell) - fg_preds = { - "boxes": preds["boxes"].view(*shape, 4)[mask].view(-1, 4), - "confidences": preds["confidences"].view(shape)[mask].view(-1), - "classprobs": preds["classprobs"].view(*shape, num_classes)[mask].view(-1, num_classes), - } + wh = xywh[:, 2:] - losses, ious = self.loss_func.pairwise(fg_preds, targets, input_is_normalized=False) - costs = losses.overlap + losses.confidence + losses.classification - costs += 100000.0 * ~inside_matrix[:, mask].repeat_interleave(boxes_per_cell, 1) - pred_mask, target_selector = _sim_ota_match(costs, ious) + # Create a [boxes_per_cell, targets] tensor for selecting prior shapes that are close enough to the target + # dimensions. + prior_wh = torch.tensor(self.prior_shapes, device=targets["boxes"].device) # XXX Enable size filtering. + shape_selector = _compare_box_sizes(prior_wh, wh, self.size_range) # XXX Enable size filtering. - # Add the anchor dimension to the mask and replace True values with the results of the actual SimOTA matching. - mask = mask.view(height, width).unsqueeze(-1).repeat(1, 1, boxes_per_cell) - mask[mask.nonzero().T.tolist()] = pred_mask + # Create a [grid_cells, targets] tensor for selecting spatial locations that are inside target bounding boxes. + centers = grid_centers(grid_size).view(-1, 2) * grid_to_image + inside_selector = is_inside_box(centers, targets["boxes"]) + + # Combine the above selectors into a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are + # inside target bounding boxes and close enough shape. + inside_selector = inside_selector[:, None, :].repeat(1, boxes_per_cell, 1) + inside_selector = torch.logical_and(inside_selector, shape_selector) # XXX Enable size filtering. 
+
+        # Set the width and height of all target bounding boxes to self.spatial_range grid cells and create a selector
+        # for anchors that are now inside the boxes. If a small target has no anchors inside its bounding box, it will
+        # be matched to one of these anchors, but a high penalty will ensure that anchors that are inside the bounding
+        # box will be preferred.
+        wh = self.spatial_range * grid_to_image * torch.ones_like(xy)
+        xywh = torch.cat((xy, wh), -1)
+        boxes = box_convert(xywh, in_fmt="cxcywh", out_fmt="xyxy")
+        close_selector = is_inside_box(centers, boxes)
-        background_mask = torch.logical_not(mask)
+        # Create a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are spatially close to a
+        # target and whose shape is close enough to the target.
+        close_selector = close_selector[:, None, :].repeat(1, boxes_per_cell, 1)
+        close_selector = torch.logical_and(close_selector, shape_selector)  # XXX Enable size filtering.
 
-        return mask, background_mask, target_selector
+        mask = torch.logical_or(inside_selector, close_selector).sum(-1) > 0
+        mask = mask.view(grid_height, grid_width, boxes_per_cell)
+        inside_selector = inside_selector.view(grid_height, grid_width, boxes_per_cell, -1)
+        return mask, inside_selector[mask]
diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py
index e6fe3f2296..57abab77b0 100644
--- a/pl_bolts/models/detection/yolo/torch_networks.py
+++ b/pl_bolts/models/detection/yolo/torch_networks.py
@@ -637,6 +637,10 @@ class YOLOV4TinyNetwork(nn.Module):
         ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that
             gives the highest IoU, default).
         matching_threshold: Threshold for "size" and "iou" matching algorithms.
+        spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell
+            area centered at the target, where `N` is the value of this parameter.
+        size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and
+            no less than `1/N` times the target dimensions, where `N` is the value of this parameter.
         ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU
             with some target greater than this threshold, the predictor will not be taken into account when calculating
             the confidence loss.
@@ -767,6 +771,10 @@ class YOLOV4Network(nn.Module):
         ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that
             gives the highest IoU, default).
         matching_threshold: Threshold for "size" and "iou" matching algorithms.
+        spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell
+            area centered at the target, where `N` is the value of this parameter.
+        size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and
+            no less than `1/N` times the target dimensions, where `N` is the value of this parameter.
         ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU
             with some target greater than this threshold, the predictor will not be taken into account when calculating
             the confidence loss.
@@ -928,6 +936,10 @@ class YOLOV4P6Network(nn.Module):
         ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that
             gives the highest IoU, default).
matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -1118,6 +1130,10 @@ class YOLOV5Network(nn.Module): ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -1277,6 +1293,8 @@ class YOLOV7Network(nn.Module): ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -1372,7 +1390,7 @@ def detect(prior_shape_idxs: Sequence[int], range: float) -> DetectionLayer: return create_detection_layer( prior_shapes, prior_shape_idxs, - sim_ota_range=range, + spatial_range=range, num_classes=num_classes, input_is_normalized=False, **kwargs, @@ -1385,10 +1403,10 @@ def detect(prior_shape_idxs: Sequence[int], range: float) -> DetectionLayer: widths=widths, depth=2, block_depth=2, activation=activation, normalization=normalization ) - w3 = widths[-4] # 256 - w4 = widths[-3] # 512 - w5 = widths[-2] # 768 - w6 = widths[-1] # 1024 + w3 = widths[-4] + w4 = widths[-3] + w5 = widths[-2] + w6 = widths[-1] self.spp = spp(w6, w6 // 2) @@ -1587,6 +1605,10 @@ class YOLOXNetwork(nn.Module): ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. 
+ size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py index 80b87a51f7..ee930706b0 100644 --- a/pl_bolts/models/detection/yolo/utils.py +++ b/pl_bolts/models/detection/yolo/utils.py @@ -76,21 +76,21 @@ def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: return (xy + offset) * scale -def aligned_iou(dims1: Tensor, dims2: Tensor) -> Tensor: +def aligned_iou(wh1: Tensor, wh2: Tensor) -> Tensor: """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at the same coordinates. Args: - dims1: Width and height of `N` boxes. Tensor of size ``[N, 2]``. - dims2: Width and height of `M` boxes. Tensor of size ``[M, 2]``. + wh1: An ``[N, 2]`` matrix of box shapes (width and height). + wh2: An ``[M, 2]`` matrix of box shapes (width and height). Returns: - Tensor of size ``[N, M]`` containing the pairwise IoU values for every element in ``dims1`` and ``dims2`` + An ``[N, M]`` matrix of pairwise IoU values for every element in ``wh1`` and ``wh2`` """ - area1 = dims1[:, 0] * dims1[:, 1] # [N] - area2 = dims2[:, 0] * dims2[:, 1] # [M] + area1 = wh1[:, 0] * wh1[:, 1] # [N] + area2 = wh2[:, 0] * wh2[:, 1] # [M] - inter_wh = torch.min(dims1[:, None, :], dims2) # [N, M, 2] + inter_wh = torch.min(wh1[:, None, :], wh2) # [N, M, 2] inter = inter_wh[:, :, 0] * inter_wh[:, :, 1] # [N, M] union = area1[:, None] + area2 - inter # [N, M] @@ -121,18 +121,16 @@ def is_inside_box(points: Tensor, boxes: Tensor) -> Tensor: """Get pairwise truth values of whether the point is inside the box. Args: - points: point (x, y) coordinates, [points, 2] - boxes: box (x1, y1, x2, y2) coordinates, [boxes, 4] + points: Point (x, y) coordinates, a tensor shaped ``[points, 2]``. + boxes: Box (x1, y1, x2, y2) coordinates, a tensor shaped ``[boxes, 4]``. Returns: - A tensor shaped ``[boxes, points]`` containing pairwise truth values of whether the points are inside the boxes. + A tensor shaped ``[points, boxes]`` containing pairwise truth values of whether the points are inside the boxes. 
""" - points = points.unsqueeze(0) # [1, points, 2] - boxes = boxes.unsqueeze(1) # [boxes, 1, 4] - lt = points - boxes[..., :2] # [boxes, points, 2] - rb = boxes[..., 2:] - points # [boxes, points, 2] - deltas = torch.cat((lt, rb), -1) # [boxes, points, 4] - return deltas.min(-1).values > 0.0 # [boxes, points] + lt = points[:, None, :] - boxes[None, :, :2] # [boxes, points, 2] + rb = boxes[None, :, 2:] - points[:, None, :] # [boxes, points, 2] + deltas = torch.cat((lt, rb), -1) # [points, boxes, 4] + return deltas.min(-1).values > 0.0 # [points, boxes] @torch.jit.script diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 350de959e3..78b9e1c735 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -93,32 +93,32 @@ class YOLO(LightningModule): *Implementation*: `Seppo Enarvi `_ The network architecture can be written in PyTorch, or read from a Darknet configuration file using the - :class:`~pl_bolts.models.detection.yolo.darknet_network.DarknetNetwork` class. ``DarknetNetwork`` is also able to - read weights from a Darknet model file. See the CLI application and the - :class:`~pl_bolts.models.detection.yolo.yolo_module.CLIYOLO` class for an example of how to specify a network - architecture. + :class:`~.darknet_network.DarknetNetwork` class. ``DarknetNetwork`` is also able to read weights that have been + saved by Darknet. See the :class:`~.yolo_module.CLIYOLO` command-line application for an example of how to specify + a network architecture. The input from the data loader is expected to be a list of images. Each image is a tensor with shape ``[channels, height, width]``. The images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can have different image sizes, as long as the size is divisible by the ratio in which the network downsamples the input. - During training, the model expects both the image tensors and a list of targets. *Each target is a dictionary - containing the following tensors*: + During training, the model expects both the image tensors and a list of targets. It's possible to train a model + using one integer class label per box, but the YOLO model supports also multiple classes per box. For multi-class + training, simply use a boolean matrix that indicates which classes are assigned to which boxes, in place of the + class labels. *Each target is a dictionary containing the following tensors*: - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in `(x1, y1, x2, y2)` format - labels (``Int64Tensor[N]`` or ``BoolTensor[N, classes]``): the class label or a boolean class mask for each ground-truth box - :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.forward` method returns all predictions from all detection - layers in one tensor with shape ``[N, anchors, classes + 5]``, where ``anchors`` is the total number of anchors in - all detection layers. The coordinates are scaled to the input image size. During training it also returns a - dictionary containing the classification, box overlap, and confidence losses. + :func:`~.yolo_module.YOLO.forward` method returns all predictions from all detection layers in one tensor with shape + ``[N, anchors, classes + 5]``, where ``anchors`` is the total number of anchors in all detection layers. The + coordinates are scaled to the input image size. 
During training it also returns a dictionary containing the + classification, box overlap, and confidence losses. - During inference, the model requires only the image tensors. - :func:`~pl_bolts.models.detection.yolo.yolo_module.YOLO.infer` method filters and processes the predictions. If a - prediction has a high score for more than one class, it will be duplicated. *The processed output is returned in a - dictionary containing the following tensors*: + During inference, the model requires only the image tensors. :func:`~.yolo_module.YOLO.infer` method filters and + processes the predictions. If a prediction has a high score for more than one class, it will be duplicated. *The + processed output is returned in a dictionary containing the following tensors*: - boxes (``FloatTensor[N, 4]``): predicted bounding box `(x1, y1, x2, y2)` coordinates in image space - scores (``FloatTensor[N]``): detection confidences @@ -126,8 +126,7 @@ class YOLO(LightningModule): Args: network: A module that represents the network layers. This can be obtained from a Darknet configuration using - :func:`~pl_bolts.models.detection.yolo.darknet_network.DarknetNetwork`, or it can be defined as PyTorch - code. + :func:`~.darknet_network.DarknetNetwork`, or it can be defined as PyTorch code. optimizer: Which optimizer class to use for training. optimizer_params: Parameters to pass to the optimizer constructor. Weight decay will be applied only to convolutional layer weights. @@ -189,11 +188,10 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te target dictionaries, one for each image. Returns: - detections (:class:`~torch.Tensor`), losses (Dict[str, :class:`~torch.Tensor`]): Detections, and if targets - were provided, a dictionary of losses. Detections are shaped - ``[batch_size, anchors, classes + 5]``, where ``anchors`` is the feature map size (width * height) times the - number of anchors per cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the - input image size. + detections (:class:`~torch.Tensor`), losses (:class:`~torch.Tensor`): Detections, and if targets were + provided, a dictionary of losses. Detections are shaped ``[batch_size, anchors, classes + 5]``, where + ``anchors`` is the feature map size (width * height) times the number of anchors per cell. The predicted box + coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. """ detections, losses, hits = self.network(images, targets) @@ -364,7 +362,7 @@ def infer(self, image: Tensor) -> Dict[str, Tensor]: box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box detections. "labels" is a vector of predicted class labels. """ - if not isinstance(image, torch.Tensor): + if not isinstance(image, Tensor): image = F.to_tensor(image) was_training = self.training @@ -475,6 +473,10 @@ class CLIYOLO(YOLO): ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that gives the highest IoU, default). matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N × N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. 
ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -494,6 +496,8 @@ def __init__( darknet_weights: Optional[str] = None, matching_algorithm: Optional[str] = None, matching_threshold: Optional[float] = None, + spatial_range: float = 5.0, + size_range: float = 4.0, ignore_bg_threshold: Optional[float] = None, overlap_func: Optional[str] = None, predict_overlap: Optional[float] = None, @@ -508,6 +512,8 @@ def __init__( darknet_weights, matching_algorithm=matching_algorithm, matching_threshold=matching_threshold, + spatial_range=spatial_range, + size_range=size_range, ignore_bg_threshold=ignore_bg_threshold, overlap_func=overlap_func, predict_overlap=predict_overlap, @@ -532,6 +538,8 @@ def __init__( num_classes=21, # The number of classes in Pascal VOC dataset. matching_algorithm=matching_algorithm, matching_threshold=matching_threshold, + spatial_range=spatial_range, + size_range=size_range, ignore_bg_threshold=ignore_bg_threshold, overlap_func=overlap_func, predict_overlap=predict_overlap, diff --git a/tests/models/yolo/unit/test_darknet_network.py b/tests/models/yolo/unit/test_darknet_network.py index 0fd45a1a42..ad8aa0c9d8 100644 --- a/tests/models/yolo/unit/test_darknet_network.py +++ b/tests/models/yolo/unit/test_darknet_network.py @@ -75,12 +75,8 @@ def test_create_maxpool(config, catch_warnings): category=PossibleUserWarning, ) - # print("size", config["size"]) - # print("stride", ) pad_size, remainder = divmod(max(config["size"], config["stride"]) - config["stride"], 2) - # print("pad_size", pad_size) maxpool, _ = _create_maxpool(config, [3]) - # print("maxpool.maxpool.padding", maxpool.maxpool.padding) assert maxpool.maxpool.kernel_size == config["size"] assert maxpool.maxpool.stride == config["stride"] diff --git a/tests/models/yolo/unit/test_target_matching.py b/tests/models/yolo/unit/test_target_matching.py index 07f00ad751..ffc0ed3617 100644 --- a/tests/models/yolo/unit/test_target_matching.py +++ b/tests/models/yolo/unit/test_target_matching.py @@ -4,18 +4,23 @@ def test_sim_ota_match(catch_warnings): - # IoUs will determined that 2 and 1 predictions will be selected for the first and the second target. - ious = torch.tensor([[0.1, 0.1, 0.9, 0.9], [0.2, 0.3, 0.4, 0.1]]) + # For each of the two targets, k will be the sum of the IoUs. 2 and 1 predictions will be selected for the first and + # the second target respectively. + ious = torch.tensor([[0.1, 0.2], [0.1, 0.3], [0.9, 0.4], [0.9, 0.1]]) # Costs will determine that the first and the last prediction will be selected for the first target, and the first - # prediction will be selected for the second target. Since the first prediction was selected for both targets, it - # will be matched to the best target only (the second one). - costs = torch.tensor([[0.3, 0.5, 0.4, 0.3], [0.1, 0.2, 0.5, 0.3]]) + # prediction will be selected for the second target. The first prediction was selected for two targets, but it will + # be matched to the best target only (the second one). + costs = torch.tensor([[0.3, 0.1], [0.5, 0.2], [0.4, 0.5], [0.3, 0.3]]) matched_preds, matched_targets = _sim_ota_match(costs, ious) + + # The first and the last prediction were matched. 
assert len(matched_preds) == 4 assert matched_preds[0] assert not matched_preds[1] assert not matched_preds[2] assert matched_preds[3] - assert len(matched_targets) == 2 # Two predictions were matched. - assert matched_targets[0] == 1 # Which target was matched to the first prediction. - assert matched_targets[1] == 0 # Which target was matched to the last prediction. + + # The first prediction was matched to the target 1 and the last prediction was matched to target 0. + assert len(matched_targets) == 2 + assert matched_targets[0] == 1 + assert matched_targets[1] == 0 diff --git a/tests/models/yolo/unit/test_utils.py b/tests/models/yolo/unit/test_utils.py index fa4121a191..65e02e79ad 100644 --- a/tests/models/yolo/unit/test_utils.py +++ b/tests/models/yolo/unit/test_utils.py @@ -60,14 +60,14 @@ def test_is_inside_box(catch_warnings): [1,7; 3,7; 5,7; 7,7; 9,7; 11,7; 13,7; 15,7; 17,7; 19,7] [1,9; 3,9; 5,9; 7,9; 9,9; 11,9; 13,9; 15,9; 17,9; 19,9]] - is_inside[0]: + is_inside[..., 0]: [[F, F, F, F, F, F, F, F, F, F] [F, T, T, F, F, F, F, F, F, F] [F, T, T, F, F, F, F, F, F, F] [F, F, F, F, F, F, F, F, F, F] [F, F, F, F, F, F, F, F, F, F]] - is_inside[1]: + is_inside[..., 1]: [[F, F, F, F, F, F, F, F, F, F] [F, F, F, F, F, F, F, F, F, F] [F, F, F, F, F, F, F, F, F, F] @@ -78,10 +78,10 @@ def test_is_inside_box(catch_warnings): centers = grid_centers(size) * 2.0 centers = centers.view(-1, 2) boxes = torch.tensor([[2, 2, 6, 6], [14, 8, 18, 10]]) - is_inside = is_inside_box(centers, boxes).view(2, 5, 10) + is_inside = is_inside_box(centers, boxes).view(5, 10, 2) assert torch.count_nonzero(is_inside) == 6 - assert torch.all(is_inside[0, 1:3, 1:3]) - assert torch.all(is_inside[1, 4, 7:9]) + assert torch.all(is_inside[1:3, 1:3, 0]) + assert torch.all(is_inside[4, 7:9, 1]) @pytest.mark.parametrize( From a9c72e34e8b8d2d2e8211d1a179c37604b761e2a Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 17 Mar 2023 15:27:45 +0200 Subject: [PATCH 62/76] Fixed docstrings --- pl_bolts/models/detection/yolo/yolo_module.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 78b9e1c735..04ca8191de 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -35,10 +35,10 @@ def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: - """Reads a batch of data, validates the format, and stacks the images into a single tensor. + """Validates the format of a batch of data and stacks the images into a single tensor. Args: - batch: The batch of data read by the :class:`~torch.utils.data.DataLoader`. + batch: A batch of data read by a :class:`~torch.utils.data.DataLoader`. Returns: The input batch with images stacked into a single tensor. @@ -76,8 +76,8 @@ def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS class YOLO(LightningModule): - """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, - YOLOv5, YOLOv7, Scaled-YOLOv4, and YOLOX. + """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, + YOLOv7, Scaled-YOLOv4, and YOLOX. *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `_ @@ -182,10 +182,9 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te that depends on the size of the feature map and the number of anchors per feature map cell. 
Args: - images: Images to be processed. Tensor of size - ``[batch_size, channels, height, width]``. - targets: If set, computes losses from detection layers against these targets. A list of - target dictionaries, one for each image. + images: Images to be processed. Tensor of size ``[batch_size, channels, height, width]``. + targets: If given, computes losses from detection layers against these targets. A list of target + dictionaries, one for each image. Returns: detections (:class:`~torch.Tensor`), losses (:class:`~torch.Tensor`): Detections, and if targets were From cc57b6002e21590b456e609ff4962940639f19af Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Fri, 17 Mar 2023 20:11:23 +0200 Subject: [PATCH 63/76] Fixed LRScheduler import for PyTorch 2.0 --- pl_bolts/models/detection/yolo/yolo_module.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 04ca8191de..3699f5f78a 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -8,6 +8,13 @@ from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT from torch import Tensor, optim +# It seems to be impossible to avoid mypy errors if using import instead of getattr(). +# See https://github.com/python/mypy/issues/8823 +try: + LRScheduler: Any = getattr(optim.lr_scheduler, "LRScheduler") +except ImportError: + LRScheduler = getattr(optim.lr_scheduler, "_LRScheduler") + from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork @@ -144,7 +151,7 @@ def __init__( network: nn.Module, optimizer: Type[optim.Optimizer] = optim.SGD, optimizer_params: Optional[Dict[str, Any]] = None, - lr_scheduler: Type[optim.lr_scheduler._LRScheduler] = LinearWarmupCosineAnnealingLR, + lr_scheduler: Type[LRScheduler] = LinearWarmupCosineAnnealingLR, lr_scheduler_params: Optional[Dict[str, Any]] = None, confidence_threshold: float = 0.2, nms_threshold: float = 0.45, @@ -206,7 +213,7 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te losses = torch.stack(losses).sum(0) return detections, losses - def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[optim.lr_scheduler._LRScheduler]]: + def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[LRScheduler]]: """Constructs the optimizer and learning rate scheduler based on ``self.optimizer_params`` and ``self.lr_scheduler_params``. 
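As an illustration of how the constructor and ``configure_optimizers`` arguments above fit together, here is a minimal usage sketch. It is not part of any patch: the hyperparameter values are only examples, and it assumes the ``YOLOV4TinyNetwork`` class from ``torch_networks`` and the ``YOLO`` constructor arguments shown in the hunks above:

    import torch.optim as optim

    from pl_bolts.models.detection.yolo.torch_networks import YOLOV4TinyNetwork
    from pl_bolts.models.detection.yolo.yolo_module import YOLO

    # 21 classes matches the Pascal VOC setup used by the CLI application.
    network = YOLOV4TinyNetwork(num_classes=21)

    # optimizer_params are passed to the optimizer class and lr_scheduler_params to the scheduler class
    # (LinearWarmupCosineAnnealingLR by default). Weight decay is applied only to convolutional layer weights.
    model = YOLO(
        network,
        optimizer=optim.SGD,
        optimizer_params={"lr": 0.01, "momentum": 0.9, "weight_decay": 0.0005},
        lr_scheduler_params={"warmup_epochs": 1, "max_epochs": 300},
    )

A PyTorch Lightning ``Trainer`` can then fit the model as usual; ``configure_optimizers`` takes care of instantiating the optimizer and the scheduler from these parameters.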
From e6f6cc1b729d6f7c31c4819cbf99a00bba4290a2 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Mon, 3 Apr 2023 17:06:00 +0300 Subject: [PATCH 64/76] Added support for label smoothing --- pl_bolts/models/detection/yolo/README.md | 2 +- .../models/detection/yolo/darknet_network.py | 47 +++--- pl_bolts/models/detection/yolo/layers.py | 75 +++++----- pl_bolts/models/detection/yolo/loss.py | 59 +++++--- .../models/detection/yolo/target_matching.py | 41 ++---- .../models/detection/yolo/torch_networks.py | 132 ++++++++++------- pl_bolts/models/detection/yolo/types.py | 8 +- pl_bolts/models/detection/yolo/utils.py | 19 +++ pl_bolts/models/detection/yolo/yolo_module.py | 138 +++++++++--------- 9 files changed, 290 insertions(+), 231 deletions(-) diff --git a/pl_bolts/models/detection/yolo/README.md b/pl_bolts/models/detection/yolo/README.md index 8f67febf0b..8794a10e5e 100644 --- a/pl_bolts/models/detection/yolo/README.md +++ b/pl_bolts/models/detection/yolo/README.md @@ -35,7 +35,7 @@ The prior shapes are also used for matching the ground-truth targets to anchors The model input is expected to be a list of images. Each image is a tensor with shape `[channels, height, width]`. The images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can have different image sizes. The feature pyramid network introduces another constraint on the image size: the width and the height have to be divisible by the ratio in which the network downsamples the input. -During training, the model expects both the image tensors and a list of targets. Each target is a dictionary containing the following tensors: +During training, the model expects both the image tensors and a list of targets. It's possible to train a model using one integer class label per box, but the YOLO model supports also multiple labels per box. For multi-label training, simply use a boolean matrix that indicates which classes are assigned to which boxes, in place of the class labels. Each target is a dictionary containing the following tensors: - *boxes*: `(x1, y1, x2, y2)` coordinates of the ground-truth boxes in a matrix with shape `[N, 4]`. 
- *labels*: Either integer class labels in a vector of size `N` or a class mask for each ground-truth box in a boolean matrix with shape `[N, classes]` diff --git a/pl_bolts/models/detection/yolo/darknet_network.py b/pl_bolts/models/detection/yolo/darknet_network.py index 7fa775775a..7de4e81c47 100644 --- a/pl_bolts/models/detection/yolo/darknet_network.py +++ b/pl_bolts/models/detection/yolo/darknet_network.py @@ -9,17 +9,16 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException try: - from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_info + from pytorch_lightning.utilities.rank_zero import rank_zero_info except ModuleNotFoundError: - from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info + from pytorch_lightning.utilities.distributed import rank_zero_info from torch import Tensor -from pl_bolts.models.detection.yolo import layers -from pl_bolts.models.detection.yolo.layers import MaxPool -from pl_bolts.models.detection.yolo.torch_networks import NETWORK_OUTPUT -from pl_bolts.models.detection.yolo.types import TARGETS -from pl_bolts.models.detection.yolo.utils import get_image_size +from .layers import Conv, DetectionLayer, MaxPool, RouteLayer, ShortcutLayer, create_detection_layer +from .torch_networks import NETWORK_OUTPUT +from .types import TARGETS +from .utils import get_image_size CONFIG = Dict[str, Any] CREATE_LAYER_OUTPUT = Tuple[nn.Module, int] # layer, num_outputs @@ -50,9 +49,11 @@ class DarknetNetwork(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou". - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -100,9 +101,9 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU image_size = get_image_size(x) for layer in self.layers: - if isinstance(layer, (layers.RouteLayer, layers.ShortcutLayer)): + if isinstance(layer, (RouteLayer, ShortcutLayer)): x = layer(outputs) - elif isinstance(layer, layers.DetectionLayer): + elif isinstance(layer, DetectionLayer): x, preds = layer(x, image_size) detections.append(x) if targets is not None: @@ -151,13 +152,11 @@ def read(tensor: Tensor) -> int: tensor.copy_(source) return num_elements - for layer_idx, layer in enumerate(self.layers): + for layer in self.layers: # Weights are loaded only to convolutional layers - if not isinstance(layer, layers.Conv): + if not isinstance(layer, Conv): continue - rank_zero_debug(f"Reading weights for layer {layer_idx}: {list(layer.conv.weight.shape)}") - # If convolution is followed by batch normalization, read the batch normalization parameters. 
Otherwise we # read the convolution bias. if isinstance(layer.norm, nn.Identity): @@ -310,7 +309,7 @@ def _create_convolutional(config: CONFIG, num_inputs: List[int], **kwargs: Any) batch_normalize = config.get("batch_normalize", False) padding = (config["size"] - 1) // 2 if config["pad"] else 0 - layer = layers.Conv( + layer = Conv( num_inputs[-1], config["filters"], kernel_size=config["size"], @@ -361,7 +360,7 @@ def _create_route(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CREAT last = len(num_inputs) - 1 source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] - layer = layers.RouteLayer(source_layers, num_chunks, chunk_idx) + layer = RouteLayer(source_layers, num_chunks, chunk_idx) # The number of outputs of a source layer is the number of inputs of the next layer. num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers) @@ -382,7 +381,7 @@ def _create_shortcut(config: CONFIG, num_inputs: List[int], **kwargs: Any) -> CR module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in its output. """ - layer = layers.ShortcutLayer(config["from"]) + layer = ShortcutLayer(config["from"]) return layer, num_inputs[-1] @@ -411,7 +410,8 @@ def _create_yolo( size_range: float = 4.0, ignore_bg_threshold: Optional[float] = None, overlap_func: Optional[Union[str, Callable]] = None, - predict_overlap: float = 1.0, + predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, overlap_loss_multiplier: Optional[float] = None, confidence_loss_multiplier: Optional[float] = None, class_loss_multiplier: Optional[float] = None, @@ -442,9 +442,11 @@ def _create_yolo( overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou". - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. 
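A small worked example (not part of the patch; the numbers are arbitrary) of how the two new parameters affect the training targets. ``predict_overlap`` is read here as a linear interpolation between a binary confidence target and the overlap, and the class targets follow the ``(label_smoothing / 2) + target * (1.0 - label_smoothing)`` mapping that the loss function applies:

    # Confidence target for a matched predictor, assuming linear interpolation between 1.0 and the overlap.
    overlap = 0.8          # output of overlap_func for the matched pair
    predict_overlap = 0.5  # 0.0 -> target is 1.0, 1.0 -> target is the overlap
    confidence_target = (1.0 - predict_overlap) * 1.0 + predict_overlap * overlap  # 0.9

    # Class probability targets with label smoothing.
    label_smoothing = 0.1
    positive_target = (label_smoothing / 2) + 1.0 * (1.0 - label_smoothing)  # 0.95
    negative_target = (label_smoothing / 2) + 0.0 * (1.0 - label_smoothing)  # 0.05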
@@ -473,7 +475,7 @@ def _create_yolo( class_loss_multiplier = config.get("cls_normalizer", 1.0) assert isinstance(class_loss_multiplier, float) - layer = layers.create_detection_layer( + layer = create_detection_layer( num_classes=config["classes"], prior_shapes=prior_shapes, prior_shape_idxs=config["mask"], @@ -484,6 +486,7 @@ def _create_yolo( ignore_bg_threshold=ignore_bg_threshold, overlap_func=overlap_func, predict_overlap=predict_overlap, + label_smoothing=label_smoothing, overlap_loss_multiplier=overlap_loss_multiplier, confidence_loss_multiplier=confidence_loss_multiplier, class_loss_multiplier=class_loss_multiplier, diff --git a/pl_bolts/models/detection/yolo/layers.py b/pl_bolts/models/detection/yolo/layers.py index 8b87fe0229..d401fa3a1b 100644 --- a/pl_bolts/models/detection/yolo/layers.py +++ b/pl_bolts/models/detection/yolo/layers.py @@ -1,21 +1,16 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union import torch -from pytorch_lightning.utilities.exceptions import MisconfigurationException from torch import Tensor, nn -from pl_bolts.models.detection.yolo.loss import LossFunction -from pl_bolts.models.detection.yolo.target_matching import ( - HighestIoUMatching, - IoUThresholdMatching, - ShapeMatching, - SimOTAMatching, - SizeRatioMatching, -) -from pl_bolts.models.detection.yolo.utils import global_xy from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg +from .loss import YOLOLoss +from .target_matching import HighestIoUMatching, IoUThresholdMatching, ShapeMatching, SimOTAMatching, SizeRatioMatching +from .types import PRED, PREDS, TARGET, TARGETS +from .utils import global_xy + if _TORCHVISION_AVAILABLE: from torchvision.ops import box_convert else: # pragma: no cover @@ -60,7 +55,7 @@ class DetectionLayer(nn.Module): prior_shapes: A list of prior box dimensions for this layer, used for scaling the predicted dimensions. The list should contain (width, height) tuples in the network input resolution. matching_func: The matching algorithm to be used for assigning targets to anchors. - loss_func: ``LossFunction`` object for calculating the losses. + loss_func: ``YOLOLoss`` object for calculating the losses. xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps to produce coordinate values close to one. input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the @@ -74,7 +69,7 @@ def __init__( num_classes: int, prior_shapes: List[Tuple[int, int]], matching_func: Callable, - loss_func: LossFunction, + loss_func: YOLOLoss, xy_scale: float = 1.0, input_is_normalized: bool = False, ) -> None: @@ -90,7 +85,7 @@ def __init__( self.xy_scale = xy_scale self.input_is_normalized = input_is_normalized - def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, List[Dict[str, Tensor]]]: + def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, PREDS]: """Runs a forward pass through this YOLO detection layer. 
Maps cell-local coordinates to global coordinates in the image space, scales the bounding boxes with the @@ -116,7 +111,7 @@ def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, List[Dict[str, num_attrs = self.num_classes + 5 anchors_per_cell = int(torch.div(num_features, num_attrs, rounding_mode="floor")) if anchors_per_cell != len(self.prior_shapes): - raise MisconfigurationException( + raise ValueError( "The model predicts {} bounding boxes per spatial location, but {} prior box dimensions are defined " "for this layer.".format(anchors_per_cell, len(self.prior_shapes)) ) @@ -159,11 +154,11 @@ def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, List[Dict[str, def match_targets( self, - preds: List[Dict[str, Tensor]], - return_preds: List[Dict[str, Tensor]], - targets: List[Dict[str, Tensor]], + preds: PREDS, + return_preds: PREDS, + targets: TARGETS, image_size: Tensor, - ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + ) -> Tuple[PRED, TARGET]: """Matches the predictions to targets. Args: @@ -199,16 +194,15 @@ def match_targets( "labels": image_targets["labels"][target_selector], } else: - device = image_preds["confidences"].device matched_preds = { - "boxes": torch.empty((0, 4), device=device), - "confidences": torch.empty(0, device=device), - "bg_confidences": image_preds["confidences"].flatten(), - "classprobs": torch.empty((0, self.num_classes), device=device), + "boxes": torch.empty((0, 4), device=image_return_preds["boxes"].device), + "confidences": torch.empty(0, device=image_return_preds["confidences"].device), + "bg_confidences": image_return_preds["confidences"].flatten(), + "classprobs": torch.empty((0, self.num_classes), device=image_return_preds["classprobs"].device), } matched_targets = { - "boxes": torch.empty((0, 4), device=device), - "labels": torch.empty(0, dtype=torch.int64, device=device), + "boxes": torch.empty((0, 4), device=image_targets["boxes"].device), + "labels": torch.empty(0, dtype=torch.int64, device=image_targets["labels"].device), } matches.append((matched_preds, matched_targets)) @@ -226,10 +220,10 @@ def match_targets( def calculate_losses( self, - preds: List[Dict[str, Tensor]], - targets: List[Dict[str, Tensor]], + preds: PREDS, + targets: TARGETS, image_size: Tensor, - loss_preds: Optional[List[Dict[str, Tensor]]] = None, + loss_preds: Optional[PREDS] = None, ) -> Tuple[Tensor, int]: """Matches the predictions to targets and computes the losses. @@ -428,7 +422,8 @@ def create_detection_layer( size_range: float = 4.0, ignore_bg_threshold: float = 0.7, overlap_func: Union[str, Callable] = "ciou", - predict_overlap: float = 1.0, + predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, overlap_loss_multiplier: float = 5.0, confidence_loss_multiplier: float = 1.0, class_loss_multiplier: float = 1.0, @@ -457,9 +452,11 @@ def create_detection_layer( overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 
0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -473,8 +470,8 @@ def create_detection_layer( """ matching_func: Union[ShapeMatching, SimOTAMatching] if matching_algorithm == "simota": - loss_func = LossFunction( - overlap_func, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + loss_func = YOLOLoss( + overlap_func, None, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier ) matching_func = SimOTAMatching(prior_shapes, prior_shape_idxs, loss_func, spatial_range, size_range) elif matching_algorithm == "size": @@ -490,9 +487,13 @@ def create_detection_layer( else: raise ValueError(f"Matching algorithm `{matching_algorithm}´ is unknown.") - loss_func = LossFunction( - overlap_func, predict_overlap, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + loss_func = YOLOLoss( + overlap_func, + predict_overlap, + label_smoothing, + overlap_loss_multiplier, + confidence_loss_multiplier, + class_loss_multiplier, ) - layer_shapes = [prior_shapes[i] for i in prior_shape_idxs] return DetectionLayer(prior_shapes=layer_shapes, matching_func=matching_func, loss_func=loss_func, **kwargs) diff --git a/pl_bolts/models/detection/yolo/loss.py b/pl_bolts/models/detection/yolo/loss.py index ba53653994..ded7143dc0 100644 --- a/pl_bolts/models/detection/yolo/loss.py +++ b/pl_bolts/models/detection/yolo/loss.py @@ -3,7 +3,7 @@ import torch from torch import Tensor -from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits +from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits, one_hot from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg @@ -111,15 +111,16 @@ def _pairwise_confidence_loss( ) -> Tensor: """Calculates the confidence loss for every pair of a foreground anchor and a target. - If ``predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target - confidence is 1. The method returns a matrix of losses for target/prediction pairs. + If ``predict_overlap`` is ``None``, the target confidence will be 1. If ``predict_overlap`` is 1.0, ``overlap`` will + be used as the target confidence. Otherwise this parameter defines a balance between these two targets. The method + returns a vector of losses for each foreground anchor. Args: preds: An ``[N]`` vector of predicted confidences. overlap: An ``[N, M]`` matrix of overlaps between all predicted and target bounding boxes. bce_func: A function for calculating binary cross entropy. - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the overlap. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the overlap. 
Returns: An ``[N, M]`` matrix of confidence losses between all predictions and targets. @@ -144,15 +145,16 @@ def _foreground_confidence_loss( ) -> Tensor: """Calculates the sum of the confidence losses for foreground anchors and their matched targets. - If ``predict_overlap`` is ``True``, ``overlap`` will be used as the target confidence. Otherwise the target - confidence is 1. The method returns a vector of losses for each foreground anchor. + If ``predict_overlap`` is ``None``, the target confidence will be 1. If ``predict_overlap`` is 1.0, ``overlap`` will + be used as the target confidence. Otherwise this parameter defines a balance between these two targets. The method + returns a vector of losses for each foreground anchor. Args: preds: A vector of predicted confidences. overlap: A vector of overlaps between matched target and predicted bounding boxes. bce_func: A function for calculating binary cross entropy. - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the overlap. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1, and 1.0 means that the target confidence is the overlap. Returns: The sum of the confidence losses for foreground anchors. @@ -179,14 +181,22 @@ def _background_confidence_loss(preds: Tensor, bce_func: Callable) -> Tensor: return bce_func(preds, targets, reduction="sum") -def _target_labels_to_probs(targets: Tensor, num_classes: int, dtype: torch.dtype) -> Tensor: +def _target_labels_to_probs( + targets: Tensor, num_classes: int, dtype: torch.dtype, label_smoothing: Optional[float] = None +) -> Tensor: """If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. + If label smoothing is disabled, the returned target probabilities will be binary. If label smoothing is enabled, the + target probabilities will be, ``(label_smoothing / 2)`` or ``(label_smoothing / 2) + (1.0 - label_smoothing)``. That + corresponds to label smoothing with two categories, since the YOLO model does multi-label classification. + Args: targets: An ``[M, C]`` matrix of target class probabilities or an ``[M]`` vector of class labels. num_classes: The number of classes (C dimension) for the new targets. If ``targets`` is already two-dimensional, checks that the length of the second dimension matches this number. dtype: Floating-point data type to be used for the one-hot targets. + label_smoothing: The epsilon parameter (weight) for label smoothing. 0.0 means no smoothing (binary targets), + and 1.0 means that the target probabilities are always 0.5. Returns: An ``[M, C]`` matrix of target class probabilities. @@ -196,13 +206,16 @@ def _target_labels_to_probs(targets: Tensor, num_classes: int, dtype: torch.dtyp # greater than the number of predicted classes, it will be mapped to the last class. last_class = torch.tensor(num_classes - 1, device=targets.device) targets = torch.min(targets, last_class) - targets = torch.nn.functional.one_hot(targets, num_classes) + targets = one_hot(targets, num_classes) elif targets.shape[-1] != num_classes: raise ValueError( f"The number of classes in the data ({targets.shape[-1]}) doesn't match the number of classes " f"predicted by the model ({num_classes})." 
) - return targets.to(dtype=dtype) + targets = targets.to(dtype=dtype) + if label_smoothing is not None: + targets = (label_smoothing / 2) + targets * (1.0 - label_smoothing) + return targets @dataclass @@ -212,16 +225,22 @@ class Losses: classification: Tensor -class LossFunction: +class YOLOLoss: """A class for calculating the YOLO losses from predictions and targets. + If label smoothing is enabled, the target class probabilities will be ``(label_smoothing / 2)`` or + ``(label_smoothing / 2) + (1.0 - label_smoothing)``, instead of 0 or 1. That corresponds to label smoothing with two + categories, since the YOLO model does multi-label classification. + Args: overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -231,6 +250,7 @@ def __init__( self, overlap_func: Union[str, Callable] = "ciou", predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, overlap_multiplier: float = 5.0, confidence_multiplier: float = 1.0, class_multiplier: float = 1.0, @@ -242,6 +262,7 @@ def __init__( self._pairwise_overlap, self._elementwise_overlap_loss = _get_iou_and_loss_functions(overlap_func) self.predict_overlap = predict_overlap + self.label_smoothing = label_smoothing self.overlap_multiplier = overlap_multiplier self.confidence_multiplier = confidence_multiplier self.class_multiplier = class_multiplier @@ -282,7 +303,9 @@ def pairwise( assert confidence_loss.shape == loss_shape pred_probs = preds["classprobs"].unsqueeze(1) # [N, 1, classes] - target_probs = _target_labels_to_probs(targets["labels"], pred_probs.shape[-1], pred_probs.dtype) + target_probs = _target_labels_to_probs( + targets["labels"], pred_probs.shape[-1], pred_probs.dtype, self.label_smoothing + ) target_probs = target_probs.unsqueeze(0) # [1, M, classes] pred_probs, target_probs = torch.broadcast_tensors(pred_probs, target_probs) class_loss = bce_func(pred_probs, target_probs, reduction="none").sum(-1) @@ -328,7 +351,9 @@ def elementwise_sums( confidence_loss += _background_confidence_loss(preds["bg_confidences"], bce_func) pred_probs = preds["classprobs"] - target_probs = _target_labels_to_probs(targets["labels"], pred_probs.shape[-1], pred_probs.dtype) + target_probs = _target_labels_to_probs( + targets["labels"], pred_probs.shape[-1], pred_probs.dtype, self.label_smoothing + ) class_loss = bce_func(pred_probs, target_probs, reduction="sum") losses = Losses( diff --git a/pl_bolts/models/detection/yolo/target_matching.py b/pl_bolts/models/detection/yolo/target_matching.py index b566d6cdee..20951dc8f1 100644 
--- a/pl_bolts/models/detection/yolo/target_matching.py +++ b/pl_bolts/models/detection/yolo/target_matching.py @@ -4,39 +4,18 @@ import torch from torch import Tensor -from pl_bolts.models.detection.yolo.loss import LossFunction -from pl_bolts.models.detection.yolo.utils import aligned_iou, grid_centers, iou_below, is_inside_box from pl_bolts.utils import _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg +from .loss import YOLOLoss +from .utils import aligned_iou, box_size_ratio, grid_centers, iou_below, is_inside_box + if _TORCHVISION_AVAILABLE: from torchvision.ops import box_convert else: warn_missing_pkg("torchvision") -def _compare_box_sizes(wh1: Tensor, wh2: Tensor, threshold: float) -> Tensor: - """Compares the dimensions of the boxes pairwise and returns a mask that indicates which pairs have similar - sizes. - - For each pair of boxes, calculates the largest ratio that can be obtained by dividing the widths with each other or - dividing the heights with each other. Returns a mask that indicates which pairs have a ratio less than the given - threshold. - - Args: - wh1: An ``[N, 2]`` matrix of box shapes (width and height). - wh2: An ``[M, 2]`` matrix of box shapes (width and height). - threshold: A threshold for the size ratio. - - Returns: - An ``[N, M]`` matrix of truth values indicating which box pairs have the maximum size ratio below the threshold. - """ - wh_ratio = wh1[:, None, :] / wh2[None, :, :] # [M, N, 2] - wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio) - wh_ratio = wh_ratio.max(2).values # [M, N] - return wh_ratio < threshold - - class ShapeMatching(ABC): """Selects which anchors are used to predict each target, by comparing the shape of the target box to a set of prior shapes. @@ -218,7 +197,7 @@ def __init__( def match(self, wh: Tensor) -> Union[Tuple[Tensor, Tensor], Tensor]: prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) - return _compare_box_sizes(wh, prior_wh, self.threshold).nonzero().T + return (box_size_ratio(wh, prior_wh) < self.threshold).nonzero().T def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]: @@ -274,7 +253,7 @@ class SimOTAMatching: network input resolution. prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that this layer uses. - loss_func: A ``LossFunction`` object that can be used to calculate the pairwise costs. + loss_func: A ``YOLOLoss`` object that can be used to calculate the pairwise costs. spatial_range: For each target, restrict to the anchors that are within an `N × N` grid cell are centered at the target, where `N` is the value of this parameter. size_range: For each target, restrict to the anchors whose prior dimensions are not larger than the target @@ -285,7 +264,7 @@ def __init__( self, prior_shapes: Sequence[Tuple[int, int]], prior_shape_idxs: Sequence[int], - loss_func: LossFunction, + loss_func: YOLOLoss, spatial_range: float, size_range: float, ) -> None: @@ -367,8 +346,8 @@ def _get_prior_mask( # Create a [boxes_per_cell, targets] tensor for selecting prior shapes that are close enough to the target # dimensions. - prior_wh = torch.tensor(self.prior_shapes, device=targets["boxes"].device) # XXX Enable size filtering. - shape_selector = _compare_box_sizes(prior_wh, wh, self.size_range) # XXX Enable size filtering. 
+ prior_wh = torch.tensor(self.prior_shapes, device=targets["boxes"].device) + shape_selector = box_size_ratio(prior_wh, wh) < self.size_range # Create a [grid_cells, targets] tensor for selecting spatial locations that are inside target bounding boxes. centers = grid_centers(grid_size).view(-1, 2) * grid_to_image @@ -377,7 +356,7 @@ def _get_prior_mask( # Combine the above selectors into a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are # inside target bounding boxes and close enough shape. inside_selector = inside_selector[:, None, :].repeat(1, boxes_per_cell, 1) - inside_selector = torch.logical_and(inside_selector, shape_selector) # XXX Enable size filtering. + inside_selector = torch.logical_and(inside_selector, shape_selector) # Set the width and height of all target bounding boxes to self.range grid cells and create a selector for # anchors that are now inside the boxes. If a small target has no anchors inside its bounding box, it will be @@ -391,7 +370,7 @@ def _get_prior_mask( # Create a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are spatially close to a # target and whose shape is close enough to the target. close_selector = close_selector[:, None, :].repeat(1, boxes_per_cell, 1) - close_selector = torch.logical_and(close_selector, shape_selector) # XXX Enable size filtering. + close_selector = torch.logical_and(close_selector, shape_selector) mask = torch.logical_or(inside_selector, close_selector).sum(-1) > 0 mask = mask.view(grid_height, grid_width, boxes_per_cell) diff --git a/pl_bolts/models/detection/yolo/torch_networks.py b/pl_bolts/models/detection/yolo/torch_networks.py index 57abab77b0..9b18a1ddcb 100644 --- a/pl_bolts/models/detection/yolo/torch_networks.py +++ b/pl_bolts/models/detection/yolo/torch_networks.py @@ -1,19 +1,19 @@ from collections import OrderedDict -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Any, List, Optional, Sequence, Tuple import torch import torch.nn as nn from torch import Tensor -from pl_bolts.models.detection.yolo.layers import Conv, DetectionLayer, MaxPool, ReOrg, create_detection_layer -from pl_bolts.models.detection.yolo.types import NETWORK_OUTPUT, TARGETS -from pl_bolts.models.detection.yolo.utils import get_image_size +from .layers import Conv, DetectionLayer, MaxPool, ReOrg, create_detection_layer +from .types import NETWORK_OUTPUT, TARGETS +from .utils import get_image_size def run_detection( detection_layer: DetectionLayer, layer_input: Tensor, - targets: Optional[List[Dict[str, Tensor]]], + targets: Optional[TARGETS], image_size: Tensor, detections: List[Tensor], losses: List[Tensor], @@ -46,7 +46,7 @@ def run_detection_with_aux_head( aux_detection_layer: DetectionLayer, layer_input: Tensor, aux_input: Tensor, - targets: Optional[List[Dict[str, Tensor]]], + targets: Optional[TARGETS], image_size: Tensor, aux_weight: float, detections: List[Tensor], @@ -131,7 +131,8 @@ class TinyStage(nn.Module): """One stage of the "tiny" network architecture from YOLOv4. Args: - num_channels: Number of channels in the input and output of the stage. + num_channels: Number of channels in the input of the stage. Partial output will have as many channels and full + output will have twice as many channels. activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", or "none". norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
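The next hunk changes the stage to return both a partial and a full output, matching the description above. A shape sketch, not part of the patch, assuming ``TinyStage`` can be imported from ``torch_networks`` and constructed with the activation and normalization names listed in its docstring:

    import torch

    from pl_bolts.models.detection.yolo.torch_networks import TinyStage

    stage = TinyStage(64, activation="leaky", norm="batchnorm")
    x = torch.rand(1, 64, 32, 32)  # input with num_channels = 64
    partial, full = stage(x)       # the revised forward() returns two tensors
    print(partial.shape)           # torch.Size([1, 64, 32, 32]), as many channels as the input
    print(full.shape)              # torch.Size([1, 128, 32, 32]), the input and the partial output concatenated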
@@ -150,11 +151,13 @@ def __init__( self.conv2 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) self.mix = Conv(num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - def forward(self, x: Tensor) -> Tensor: - x = torch.chunk(x, 2, dim=1)[1] - y1 = self.conv1(x) + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: + partial = torch.chunk(x, 2, dim=1)[1] + y1 = self.conv1(partial) y2 = self.conv2(y1) - return self.mix(torch.cat((y2, y1), dim=1)) + partial_output = self.mix(torch.cat((y2, y1), dim=1)) + full_output = torch.cat((x, partial_output), dim=1) + return partial_output, full_output class CSPStage(nn.Module): @@ -385,8 +388,10 @@ def smooth(num_channels: int) -> nn.Module: return Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) def downsample(in_channels: int, out_channels: int) -> nn.Module: - conv = Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) - return nn.Sequential(OrderedDict([("downsample", conv), ("smooth", smooth(out_channels))])) + conv_module = Conv( + in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization + ) + return nn.Sequential(OrderedDict([("downsample", conv_module), ("smooth", smooth(out_channels))])) def maxpool(out_channels: int) -> nn.Module: return nn.Sequential( @@ -399,27 +404,29 @@ def maxpool(out_channels: int) -> nn.Module: ) ) - self.stage1 = Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization) - self.downsample2 = downsample(width, width * 2) - self.stage2 = TinyStage(width * 2, activation=activation, norm=normalization) - self.downsample3 = maxpool(width * 4) - self.stage3 = TinyStage(width * 4, activation=activation, norm=normalization) - self.downsample4 = maxpool(width * 8) - self.stage4 = TinyStage(width * 8, activation=activation, norm=normalization) - self.downsample5 = maxpool(width * 16) + def stage(out_channels: int, use_maxpool: bool) -> nn.Module: + if use_maxpool: + downsample_module = maxpool(out_channels) + else: + downsample_module = downsample(out_channels // 2, out_channels) + stage_module = TinyStage(out_channels, activation=activation, norm=normalization) + return nn.Sequential(OrderedDict([("downsample", downsample_module), ("stage", stage_module)])) + + stages = [ + Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization), + stage(width * 2, False), + stage(width * 4, True), + stage(width * 8, True), + maxpool(width * 16), + ] + self.stages = nn.ModuleList(stages) def forward(self, x: Tensor) -> List[Tensor]: - c1 = self.stage1(x) - x = self.downsample2(c1) - c2 = self.stage2(x) - x = torch.cat((x, c2), dim=1) - x = self.downsample3(x) - c3 = self.stage3(x) - x = torch.cat((x, c3), dim=1) - x = self.downsample4(x) - c4 = self.stage4(x) - x = torch.cat((x, c4), dim=1) - c5 = self.downsample5(x) + c1 = self.stages[0](x) + c2, x = self.stages[1](c1) + c3, x = self.stages[2](x) + c4, x = self.stages[3](x) + c5 = self.stages[4](x) return [c1, c2, c3, c4, c5] @@ -538,18 +545,21 @@ def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: ) ) - self.stage1 = downsample(in_channels, width, kernel_size=6) - self.stage2 = stage(width, width * 2, depth) - self.stage3 = stage(width * 2, width * 4, depth * 2) - self.stage4 = stage(width * 4, width * 8, depth * 3) - self.stage5 = stage(width * 8, width * 16, depth) + stages = [ + 
downsample(in_channels, width, kernel_size=6), + stage(width, width * 2, depth), + stage(width * 2, width * 4, depth * 2), + stage(width * 4, width * 8, depth * 3), + stage(width * 8, width * 16, depth), + ] + self.stages = nn.ModuleList(stages) def forward(self, x: Tensor) -> List[Tensor]: - c1 = self.stage1(x) - c2 = self.stage2(c1) - c3 = self.stage3(c2) - c4 = self.stage4(c3) - c5 = self.stage5(c4) + c1 = self.stages[0](x) + c2 = self.stages[1](c1) + c3 = self.stages[2](c2) + c4 = self.stages[3](c3) + c5 = self.stages[4](c4) return [c1, c2, c3, c4, c5] @@ -647,9 +657,11 @@ class YOLOV4TinyNetwork(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -781,9 +793,11 @@ class YOLOV4Network(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -946,9 +960,11 @@ class YOLOV4P6Network(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. 
+ label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -1140,9 +1156,11 @@ class YOLOV5Network(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -1301,9 +1319,11 @@ class YOLOV7Network(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that target - confidence is one if there's an object, and 1.0 means that the target confidence is the output of + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. overlap_loss_multiplier: Overlap loss will be scaled by this value. confidence_loss_multiplier: Confidence loss will be scaled by this value. class_loss_multiplier: Classification loss will be scaled by this value. @@ -1315,7 +1335,7 @@ def __init__( self, num_classes: int, backbone: Optional[nn.Module] = None, - widths: Sequence[int] = (32, 64, 128, 256, 512, 1024, 1024), + widths: Sequence[int] = (64, 128, 256, 512, 768, 1024), activation: Optional[str] = "silu", normalization: Optional[str] = "batchnorm", prior_shapes: Optional[List[Tuple[int, int]]] = None, @@ -1615,9 +1635,11 @@ class YOLOXNetwork(nn.Module): overlap_func: A function for calculating the pairwise overlaps between two sets of boxes. Either a string or a function that returns a matrix of pairwise overlaps. Valid string values are "iou", "giou", "diou", and "ciou" (default). - predict_overlap: Balance between binary confidence targets and predicting the overlap. 
0.0 means that target
-            confidence is one if there's an object, and 1.0 means that the target confidence is the output of
+        predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target
+            confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of
             ``overlap_func``.
+        label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary
+            targets), and 1.0 means that the target probabilities are always 0.5.
         overlap_loss_multiplier: Overlap loss will be scaled by this value.
         confidence_loss_multiplier: Confidence loss will be scaled by this value.
         class_loss_multiplier: Classification loss will be scaled by this value.
diff --git a/pl_bolts/models/detection/yolo/types.py b/pl_bolts/models/detection/yolo/types.py
index 8a37e72a89..e6c282f8e9 100644
--- a/pl_bolts/models/detection/yolo/types.py
+++ b/pl_bolts/models/detection/yolo/types.py
@@ -1,7 +1,11 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 
 from torch import Tensor
 
+IMAGES = Union[Tuple[Tensor, ...], List[Tensor]]
+PRED = Dict[str, Any]
+PREDS = Union[Tuple[PRED, ...], List[PRED]]
 TARGET = Dict[str, Any]
-TARGETS = List[TARGET]
+TARGETS = Union[Tuple[TARGET, ...], List[TARGET]]
+BATCH = Tuple[IMAGES, TARGETS]
 NETWORK_OUTPUT = Tuple[List[Tensor], List[Tensor], List[int]]  # detections, losses, hits
diff --git a/pl_bolts/models/detection/yolo/utils.py b/pl_bolts/models/detection/yolo/utils.py
index ee930706b0..2febe16208 100644
--- a/pl_bolts/models/detection/yolo/utils.py
+++ b/pl_bolts/models/detection/yolo/utils.py
@@ -133,6 +133,25 @@ def is_inside_box(points: Tensor, boxes: Tensor) -> Tensor:
     return deltas.min(-1).values > 0.0  # [points, boxes]
 
 
+def box_size_ratio(wh1: Tensor, wh2: Tensor) -> Tensor:
+    """Compares the dimensions of the boxes pairwise.
+
+    For each pair of boxes, calculates the largest ratio that can be obtained by dividing the widths with each other or
+    dividing the heights with each other.
+
+    Args:
+        wh1: An ``[N, 2]`` matrix of box shapes (width and height).
+        wh2: An ``[M, 2]`` matrix of box shapes (width and height).
+
+    Returns:
+        An ``[N, M]`` matrix of ratios of width or height dimensions, whichever is larger.
+    """
+    wh_ratio = wh1[:, None, :] / wh2[None, :, :]  # [N, M, 2]
+    wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio)
+    wh_ratio = wh_ratio.max(2).values  # [N, M]
+    return wh_ratio
+
+
 @torch.jit.script
 def get_image_size(images: Tensor) -> Tensor:
     """Get the image size from an input tensor.
diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py
index 3699f5f78a..8de72fc2c2 100644
--- a/pl_bolts/models/detection/yolo/yolo_module.py
+++ b/pl_bolts/models/detection/yolo/yolo_module.py
@@ -5,25 +5,26 @@
 import torch.nn as nn
 from pytorch_lightning import LightningModule
 from pytorch_lightning.utilities.cli import LightningCLI
-from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
+from pytorch_lightning.utilities.types import STEP_OUTPUT
 from torch import Tensor, optim
 
 # It seems to be impossible to avoid mypy errors if using import instead of getattr().
# See https://github.com/python/mypy/issues/8823 try: LRScheduler: Any = getattr(optim.lr_scheduler, "LRScheduler") -except ImportError: +except AttributeError: LRScheduler = getattr(optim.lr_scheduler, "_LRScheduler") from pl_bolts.datamodules import VOCDetectionDataModule from pl_bolts.datamodules.vocdetection_datamodule import Compose -from pl_bolts.models.detection.yolo.darknet_network import DarknetNetwork -from pl_bolts.models.detection.yolo.torch_networks import YOLOV4Network -from pl_bolts.models.detection.yolo.types import TARGET, TARGETS from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from pl_bolts.utils import _TORCHMETRICS_DETECTION_AVAILABLE, _TORCHVISION_AVAILABLE from pl_bolts.utils.warnings import warn_missing_pkg +from .darknet_network import DarknetNetwork +from .torch_networks import YOLOV4Network +from .types import BATCH, IMAGES, PRED, PREDS, TARGET, TARGETS + if _TORCHMETRICS_DETECTION_AVAILABLE: try: from torchmetrics.detection import MeanAveragePrecision @@ -41,30 +42,34 @@ warn_missing_pkg("torchvision") -def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS]: - """Validates the format of a batch of data and stacks the images into a single tensor. +def validate_batch(images: Union[Tensor, IMAGES], targets: Optional[TARGETS]) -> None: + """Validates the format of a batch of data. Args: - batch: A batch of data read by a :class:`~torch.utils.data.DataLoader`. - - Returns: - The input batch with images stacked into a single tensor. + images: A tensor containing a batch of images or a list of image tensors. + targets: A list of target dictionaries or ``None``. If a list is provided, there should be as many target + dictionaries as there are images. """ - images, targets = batch - - if not images: - raise ValueError("No images in batch.") - + if not isinstance(images, Tensor): + if not isinstance(images, (tuple, list)): + raise TypeError(f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}.") + if not images: + raise ValueError("No images in batch.") + shape = images[0].shape + for image in images: + if not isinstance(image, Tensor): + raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") + if image.shape != shape: + raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") + + if targets is None: + return + + if not isinstance(targets, (tuple, list)): + raise TypeError(f"Expected targets to be a tuple or a list, got {type(images).__name__}.") if len(images) != len(targets): raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") - shape = images[0].shape - for image in images: - if not isinstance(image, Tensor): - raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") - if image.shape != shape: - raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") - for target in targets: boxes = target["boxes"] if not isinstance(boxes, Tensor): @@ -79,8 +84,6 @@ def validate_batch(batch: Tuple[List[Tensor], TARGETS]) -> Tuple[Tensor, TARGETS f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." ) - return torch.stack(images), targets - class YOLO(LightningModule): """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, @@ -104,13 +107,13 @@ class YOLO(LightningModule): saved by Darknet. 
See the :class:`~.yolo_module.CLIYOLO` command-line application for an example of how to specify a network architecture. - The input from the data loader is expected to be a list of images. Each image is a tensor with shape - ``[channels, height, width]``. The images from a single batch will be stacked into a single tensor, so the sizes - have to match. Different batches can have different image sizes, as long as the size is divisible by the ratio in - which the network downsamples the input. + The input is expected to be a list of images. Each image is a tensor with shape ``[channels, height, width]``. The + images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can + have different image sizes, as long as the size is divisible by the ratio in which the network downsamples the + input. During training, the model expects both the image tensors and a list of targets. It's possible to train a model - using one integer class label per box, but the YOLO model supports also multiple classes per box. For multi-class + using one integer class label per box, but the YOLO model supports also multiple labels per box. For multi-label training, simply use a boolean matrix that indicates which classes are assigned to which boxes, in place of the class labels. *Each target is a dictionary containing the following tensors*: @@ -181,7 +184,9 @@ def __init__( self._val_map = MeanAveragePrecision() self._test_map = MeanAveragePrecision() - def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Tensor, Tuple[Tensor, Tensor]]: + def forward( + self, images: Union[Tensor, IMAGES], targets: Optional[TARGETS] = None + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets are provided, computes the losses from the detection layers. @@ -189,7 +194,8 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te that depends on the size of the feature map and the number of anchors per feature map cell. Args: - images: Images to be processed. Tensor of size ``[batch_size, channels, height, width]``. + images: A tensor of size ``[batch_size, channels, height, width]`` containing a batch of images or a list of + image tensors. targets: If given, computes losses from detection layers against these targets. A list of target dictionaries, one for each image. @@ -199,7 +205,9 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te ``anchors`` is the feature map size (width * height) times the number of anchors per cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. 
""" - detections, losses, hits = self.network(images, targets) + validate_batch(images, targets) + images_tensor = images if isinstance(images, Tensor) else torch.stack(images) + detections, losses, hits = self.network(images_tensor, targets) detections = torch.cat(detections, 1) if targets is None: @@ -208,7 +216,7 @@ def forward(self, images: Tensor, targets: Optional[TARGETS] = None) -> Union[Te total_hits = sum(hits) for layer_idx, layer_hits in enumerate(hits): hit_rate: Union[Tensor, float] = torch.true_divide(layer_hits, total_hits) if total_hits > 0 else 1.0 - self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=True, batch_size=images.size(0)) + self.log(f"layer_{layer_idx}_hit_rate", hit_rate, sync_dist=True, batch_size=len(images)) losses = torch.stack(losses).sum(0) return detections, losses @@ -245,7 +253,7 @@ def configure_optimizers(self) -> Tuple[List[optim.Optimizer], List[LRScheduler] lr_scheduler = self.lr_scheduler_class(optimizer, **self.lr_scheduler_params) return [optimizer], [lr_scheduler] - def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> STEP_OUTPUT: + def training_step(self, batch: BATCH, batch_idx: int) -> STEP_OUTPUT: """Computes the training loss. Args: @@ -256,7 +264,7 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Returns: A dictionary that includes the training loss in 'loss'. """ - images, targets = validate_batch(batch) + images, targets = batch _, losses = self(images, targets) self.log("train/overlap_loss", losses[0], prog_bar=True, sync_dist=True) @@ -266,7 +274,7 @@ def training_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> return {"loss": losses.sum()} - def validation_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: + def validation_step(self, batch: BATCH, batch_idx: int) -> Optional[STEP_OUTPUT]: """Evaluates a batch of data from the validation set. Args: @@ -274,32 +282,31 @@ def validation_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) - dictionaries. batch_idx: Index of the current batch. """ - images, targets = validate_batch(batch) + images, targets = batch detections, losses = self(images, targets) - self.log("val/overlap_loss", losses[0], sync_dist=True, batch_size=images.size(0)) - self.log("val/confidence_loss", losses[1], sync_dist=True, batch_size=images.size(0)) - self.log("val/class_loss", losses[2], sync_dist=True, batch_size=images.size(0)) - self.log("val/total_loss", losses.sum(), sync_dist=True, batch_size=images.size(0)) + self.log("val/overlap_loss", losses[0], sync_dist=True, batch_size=len(images)) + self.log("val/confidence_loss", losses[1], sync_dist=True, batch_size=len(images)) + self.log("val/class_loss", losses[2], sync_dist=True, batch_size=len(images)) + self.log("val/total_loss", losses.sum(), sync_dist=True, batch_size=len(images)) if _MEAN_AVERAGE_PRECISION_AVAILABLE: detections = self.process_detections(detections) targets = self.process_targets(targets) self._val_map.update(detections, targets) - def validation_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: - # When continuing training from a checkpoint, it may happen that epoch_end is called without outputs. In this + def on_validation_epoch_end(self) -> None: + # When continuing training from a checkpoint, it may happen that epoch_end is called without detections. In this # case the metrics cannot be computed. 
- if not outputs: + if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._val_map.detection_boxes): return - if _MEAN_AVERAGE_PRECISION_AVAILABLE: - map_scores = self._val_map.compute() - map_scores = {"val/" + k: v for k, v in map_scores.items()} - self.log_dict(map_scores, sync_dist=True) - self._val_map.reset() + map_scores = self._val_map.compute() + map_scores = {"val/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._val_map.reset() - def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Optional[STEP_OUTPUT]: + def test_step(self, batch: BATCH, batch_idx: int) -> Optional[STEP_OUTPUT]: """Evaluates a batch of data from the test set. Args: @@ -307,7 +314,7 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti dictionaries. batch_idx: Index of the current batch. """ - images, targets = validate_batch(batch) + images, targets = batch detections, losses = self(images, targets) self.log("test/overlap_loss", losses[0], sync_dist=True) @@ -320,19 +327,18 @@ def test_step(self, batch: Tuple[List[Tensor], TARGETS], batch_idx: int) -> Opti targets = self.process_targets(targets) self._test_map.update(detections, targets) - def test_epoch_end(self, outputs: Union[EPOCH_OUTPUT, List[EPOCH_OUTPUT]]) -> None: - # When continuing training from a checkpoint, it may happen that epoch_end is called without outputs. In this + def on_test_epoch_end(self) -> None: + # When continuing training from a checkpoint, it may happen that epoch_end is called without detections. In this # case the metrics cannot be computed. - if not outputs: + if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._test_map.detection_boxes): return - if _MEAN_AVERAGE_PRECISION_AVAILABLE: - map_scores = self._test_map.compute() - map_scores = {"test/" + k: v for k, v in map_scores.items()} - self.log_dict(map_scores, sync_dist=True) - self._test_map.reset() + map_scores = self._test_map.compute() + map_scores = {"test/" + k: v for k, v in map_scores.items()} + self.log_dict(map_scores, sync_dist=True) + self._test_map.reset() - def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> List[Dict[str, Tensor]]: + def predict_step(self, batch: BATCH, batch_idx: int, dataloader_idx: int = 0) -> PREDS: """Feeds a batch of images to the network and returns the detected bounding boxes, confidence scores, and class labels. @@ -349,12 +355,12 @@ class labels. bounding box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box detections. "labels" is a vector of predicted class labels. """ - images, _ = validate_batch(batch) + images, _ = batch detections = self(images) detections = self.process_detections(detections) return detections - def infer(self, image: Tensor) -> Dict[str, Tensor]: + def infer(self, image: Tensor) -> PRED: """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class labels. 
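As a quick illustration of the batch format consumed by the training and validation steps above, here is a minimal sketch. It is not part of the patch; the import paths follow the module layout shown in these diffs, the constructor arguments mirror the unit tests further down, and the boolean label matrix follows the multi-label format described in the class docstring, so treat it as an assumption rather than the canonical API.

import torch

from pl_bolts.models.detection import YOLO
from pl_bolts.models.detection.yolo.torch_networks import YOLOV4TinyNetwork

# A small network, as in the unit tests. "giou" keeps the example compatible with
# older Torchvision versions.
network = YOLOV4TinyNetwork(num_classes=3, width=4, overlap_func="giou")
model = YOLO(network)

# Two images of the same size and one target dictionary per image. The labels are
# given as an [N, num_classes] boolean matrix, so a box can carry several classes.
images = [torch.rand(3, 256, 256), torch.rand(3, 256, 256)]
targets = [
    {
        "boxes": torch.tensor([[20.0, 30.0, 120.0, 140.0]]),
        "labels": torch.tensor([[True, False, True]]),
    },
    {
        "boxes": torch.tensor([[50.0, 60.0, 90.0, 200.0], [10.0, 10.0, 40.0, 40.0]]),
        "labels": torch.tensor([[False, True, False], [True, False, False]]),
    },
]

# In training mode the forward pass returns both the detections and the three
# losses (overlap, confidence, classification).
detections, losses = model(images, targets)
print(losses.sum())

In a real setup the images and targets would come from a DataLoader with a collate function, as in the tests below.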
@@ -374,7 +380,7 @@ def infer(self, image: Tensor) -> Dict[str, Tensor]: was_training = self.training self.eval() - detections = self(image.unsqueeze(0)) + detections = self([image]) detections = self.process_detections(detections) detections = detections[0] @@ -382,7 +388,7 @@ def infer(self, image: Tensor) -> Dict[str, Tensor]: self.train() return detections - def process_detections(self, preds: Tensor) -> List[Dict[str, Tensor]]: + def process_detections(self, preds: Tensor) -> PREDS: """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of predictions. @@ -419,7 +425,7 @@ def process(boxes: Tensor, confidences: Tensor, classprobs: Tensor) -> Dict[str, return [process(p[..., :4], p[..., 4], p[..., 5:]) for p in preds] - def process_targets(self, targets: TARGETS) -> TARGETS: + def process_targets(self, targets: TARGETS) -> List[TARGET]: """Duplicates multi-label targets to create one target for each label. Args: From 2e552365a0c3b760ee15d2e832a57493b56c7f9f Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Tue, 4 Apr 2023 12:12:19 +0300 Subject: [PATCH 65/76] Added unit tests for YOLOv7 and box_size_ratio() --- pl_bolts/models/detection/yolo/yolo_module.py | 8 +- tests/models/test_detection.py | 71 +++++++++++++++--- .../models/yolo/unit/test_darknet_network.py | 8 +- tests/models/yolo/unit/test_utils.py | 74 +++++++++++-------- 4 files changed, 111 insertions(+), 50 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 8de72fc2c2..c97d2e75df 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -23,7 +23,7 @@ from .darknet_network import DarknetNetwork from .torch_networks import YOLOV4Network -from .types import BATCH, IMAGES, PRED, PREDS, TARGET, TARGETS +from .types import BATCH, IMAGES, PRED, TARGET, TARGETS if _TORCHMETRICS_DETECTION_AVAILABLE: try: @@ -126,7 +126,7 @@ class labels. *Each target is a dictionary containing the following tensors*: coordinates are scaled to the input image size. During training it also returns a dictionary containing the classification, box overlap, and confidence losses. - During inference, the model requires only the image tensors. :func:`~.yolo_module.YOLO.infer` method filters and + During inference, the model requires only the image tensor. :func:`~.yolo_module.YOLO.infer` method filters and processes the predictions. If a prediction has a high score for more than one class, it will be duplicated. *The processed output is returned in a dictionary containing the following tensors*: @@ -338,7 +338,7 @@ def on_test_epoch_end(self) -> None: self.log_dict(map_scores, sync_dist=True) self._test_map.reset() - def predict_step(self, batch: BATCH, batch_idx: int, dataloader_idx: int = 0) -> PREDS: + def predict_step(self, batch: BATCH, batch_idx: int, dataloader_idx: int = 0) -> List[PRED]: """Feeds a batch of images to the network and returns the detected bounding boxes, confidence scores, and class labels. 
@@ -388,7 +388,7 @@ def infer(self, image: Tensor) -> PRED: self.train() return detections - def process_detections(self, preds: Tensor) -> PREDS: + def process_detections(self, preds: Tensor) -> List[PRED]: """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of predictions. diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 933d0157e3..b9e0926846 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -17,6 +17,7 @@ YOLOV4P6Network, YOLOV4TinyNetwork, YOLOV5Network, + YOLOV7Network, YOLOXNetwork, ) from pl_bolts.models.detection.faster_rcnn import create_fasterrcnn_backbone @@ -96,8 +97,11 @@ def test_darknet(config, catch_warnings): network = DarknetNetwork(config_path) model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections @pytest.mark.parametrize( @@ -129,8 +133,11 @@ def test_yolov4_tiny(catch_warnings): network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections def test_yolov4_tiny_train(tmpdir): @@ -154,8 +161,11 @@ def test_yolov4(catch_warnings): network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections def test_yolov4_train(tmpdir, catch_warnings): @@ -179,8 +189,11 @@ def test_yolov4p6(catch_warnings): network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections def test_yolov4p6_train(tmpdir, catch_warnings): @@ -204,8 +217,11 @@ def test_yolov5(catch_warnings): network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections def test_yolov5_train(tmpdir, catch_warnings): @@ -225,12 +241,43 @@ def test_yolov5_train(tmpdir, catch_warnings): trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) +def test_yolov7(catch_warnings): + network = YOLOV7Network(num_classes=2, depth=1, width=4, overlap_func="giou") + model = YOLO(network) + + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections + + +def test_yolov7_train(tmpdir, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + + network = YOLOV7Network(num_classes=2, depth=1, width=4, 
overlap_func="giou") + model = YOLO(network) + + train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) + + trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, logger=False, max_epochs=10, accelerator="auto") + trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl) + + def test_yolox(catch_warnings): network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network) - image = torch.rand(1, 3, 256, 256) - model(image) + image = torch.rand(3, 256, 256) + detections = model.infer(image) + assert "boxes" in detections + assert "scores" in detections + assert "labels" in detections def test_yolox_train(tmpdir, catch_warnings): diff --git a/tests/models/yolo/unit/test_darknet_network.py b/tests/models/yolo/unit/test_darknet_network.py index ad8aa0c9d8..9f493f8d64 100644 --- a/tests/models/yolo/unit/test_darknet_network.py +++ b/tests/models/yolo/unit/test_darknet_network.py @@ -34,21 +34,19 @@ def test_create_convolutional(config, catch_warnings): assert conv.conv.kernel_size == (config["size"], config["size"]) assert conv.conv.stride == (config["stride"], config["stride"]) - activation = config["activation"] pad_size = (config["size"] - 1) // 2 if config["pad"] else 0 - if config["pad"]: assert conv.conv.padding == (pad_size, pad_size) if config["batch_normalize"]: assert isinstance(conv.norm, nn.BatchNorm2d) - if activation == "linear": + if config["activation"] == "linear": assert isinstance(conv.act, nn.Identity) - elif activation == "logistic": + elif config["activation"] == "logistic": assert isinstance(conv.act, nn.Sigmoid) else: - assert conv.act.__class__.__name__.lower().startswith(activation) + assert conv.act.__class__.__name__.lower().startswith(config["activation"]) @pytest.mark.parametrize( diff --git a/tests/models/yolo/unit/test_utils.py b/tests/models/yolo/unit/test_utils.py index 65e02e79ad..e8e1d02ad1 100644 --- a/tests/models/yolo/unit/test_utils.py +++ b/tests/models/yolo/unit/test_utils.py @@ -6,6 +6,7 @@ from pl_bolts.models.detection.yolo.utils import ( aligned_iou, + box_size_ratio, global_xy, grid_centers, grid_offsets, @@ -51,6 +52,37 @@ def test_global_xy(catch_warnings): assert torch.all(xy[:, 3, :, :, 1] == 175) +@pytest.mark.parametrize( + "dims1, dims2, expected_ious", + [ + ( + torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), + torch.tensor([[1.0, 10.0], [2.0, 20.0]]), + torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), + ) + ], +) +def test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): + warnings.filterwarnings( + "ignore", + message=".*does not have many workers which may be a bottleneck.*", + category=PossibleUserWarning, + ) + + torch.testing.assert_close(aligned_iou(dims1, dims2), expected_ious) + + +def test_iou_below(catch_warnings): + tl = torch.rand((10, 10, 3, 2)) * 100 + br = tl + 10 + pred_boxes = torch.cat((tl, br), -1) + target_boxes = torch.stack((pred_boxes[1, 1, 0], pred_boxes[3, 5, 1])) + result = iou_below(pred_boxes, target_boxes, 0.9) + assert result.shape == (10, 10, 3) + assert not result[1, 1, 0] + assert not result[3, 5, 1] + + def test_is_inside_box(catch_warnings): """ centers: @@ -84,32 +116,16 @@ def test_is_inside_box(catch_warnings): assert torch.all(is_inside[4, 7:9, 1]) -@pytest.mark.parametrize( - "dims1, dims2, expected_ious", - [ - ( - torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 
10.0]]), - torch.tensor([[1.0, 10.0], [2.0, 20.0]]), - torch.tensor([[1.0 / 10.0, 1.0 / 40.0], [1.0 / 19.0, 2.0 / 48.0], [10.0 / 1000.0, 20.0 / 1020.0]]), - ) - ], -) -def test_aligned_iou(dims1, dims2, expected_ious, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - torch.testing.assert_close(aligned_iou(dims1, dims2), expected_ious) - - -def test_iou_below(catch_warnings): - tl = torch.rand((10, 10, 3, 2)) * 100 - br = tl + 10 - pred_boxes = torch.cat((tl, br), -1) - target_boxes = torch.stack((pred_boxes[1, 1, 0], pred_boxes[3, 5, 1])) - result = iou_below(pred_boxes, target_boxes, 0.9) - assert result.shape == (10, 10, 3) - assert not result[1, 1, 0] - assert not result[3, 5, 1] +def test_box_size_ratio(catch_warnings): + wh1 = torch.tensor([[24, 11], [12, 25], [26, 27], [15, 17]]) + wh2 = torch.tensor([[10, 30], [15, 9]]) + result = box_size_ratio(wh1, wh2) + assert result.shape == (4, 2) + assert result[0, 0] == 30 / 11 + assert result[0, 1] == 24 / 15 + assert result[1, 0] == 12 / 10 + assert result[1, 1] == 25 / 9 + assert result[2, 0] == 26 / 10 + assert result[2, 1] == 27 / 9 + assert result[3, 0] == 30 / 17 + assert result[3, 1] == 17 / 9 From c5092855e9fc8f80d2dc8d719d27be86953d4850 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 5 Apr 2023 13:00:50 +0300 Subject: [PATCH 66/76] Speeded up YOLO unit tests (NMS) considerably by using a higher confidence threshold --- tests/models/test_detection.py | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index b9e0926846..ba65ac262e 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -95,7 +95,7 @@ def test_fasterrcnn_pyt_module_bbone_train(tmpdir): def test_darknet(config, catch_warnings): config_path = Path(TEST_ROOT) / "data" / f"{config}.cfg" network = DarknetNetwork(config_path) - model = YOLO(network) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -120,7 +120,7 @@ def test_darknet_train(tmpdir, cfg_name, catch_warnings): config_path = Path(TEST_ROOT) / "data" / f"{cfg_name}.cfg" network = DarknetNetwork(config_path) - model = YOLO(network) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -130,8 +130,8 @@ def test_darknet_train(tmpdir, cfg_name, catch_warnings): def test_yolov4_tiny(catch_warnings): - network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV4TinyNetwork(num_classes=2, width=4) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -147,8 +147,8 @@ def test_yolov4_tiny_train(tmpdir): category=PossibleUserWarning, ) - network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV4TinyNetwork(num_classes=2, width=4) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -158,8 +158,8 @@ def test_yolov4_tiny_train(tmpdir): def test_yolov4(catch_warnings): - network = 
YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") - model = YOLO(network) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -175,8 +175,8 @@ def test_yolov4_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") - model = YOLO(network) + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -186,8 +186,8 @@ def test_yolov4_train(tmpdir, catch_warnings): def test_yolov4p6(catch_warnings): - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") - model = YOLO(network) + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -203,8 +203,8 @@ def test_yolov4p6_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") - model = YOLO(network) + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -214,8 +214,8 @@ def test_yolov4p6_train(tmpdir, catch_warnings): def test_yolov5(catch_warnings): - network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV5Network(num_classes=2, depth=1, width=4) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -231,8 +231,8 @@ def test_yolov5_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV5Network(num_classes=2, depth=1, width=4) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -242,8 +242,8 @@ def test_yolov5_train(tmpdir, catch_warnings): def test_yolov7(catch_warnings): - network = YOLOV7Network(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -259,8 +259,8 @@ def test_yolov7_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV7Network(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -270,8 +270,8 @@ def test_yolov7_train(tmpdir, 
catch_warnings): def test_yolox(catch_warnings): - network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOXNetwork(num_classes=2, depth=1, width=4) + model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) detections = model.infer(image) @@ -287,8 +287,8 @@ def test_yolox_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") - model = YOLO(network) + network = YOLOXNetwork(num_classes=2, depth=1, width=4) + model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) valid_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) From 614f7e6f69a302356e419bccba8672d1eac84ec3 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 5 Apr 2023 20:30:13 +0300 Subject: [PATCH 67/76] detection_boxes is now called detections in MeanAveragePrecision --- pl_bolts/models/detection/yolo/yolo_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index c97d2e75df..4d3532746e 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -298,7 +298,7 @@ def validation_step(self, batch: BATCH, batch_idx: int) -> Optional[STEP_OUTPUT] def on_validation_epoch_end(self) -> None: # When continuing training from a checkpoint, it may happen that epoch_end is called without detections. In this # case the metrics cannot be computed. - if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._val_map.detection_boxes): + if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._val_map.detections): return map_scores = self._val_map.compute() @@ -330,7 +330,7 @@ def test_step(self, batch: BATCH, batch_idx: int) -> Optional[STEP_OUTPUT]: def on_test_epoch_end(self) -> None: # When continuing training from a checkpoint, it may happen that epoch_end is called without detections. In this # case the metrics cannot be computed. - if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._test_map.detection_boxes): + if (not _MEAN_AVERAGE_PRECISION_AVAILABLE) or (not self._test_map.detections): return map_scores = self._test_map.compute() From 224273668184f0000b5cb5d8a24f2146d5e020ee Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Apr 2023 05:44:12 +0300 Subject: [PATCH 68/76] Use giou in YOLO tests to allow them to pass also with older versions of Torchvision --- tests/models/test_detection.py | 36 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index ba65ac262e..c9c748b5ce 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -130,7 +130,8 @@ def test_darknet_train(tmpdir, cfg_name, catch_warnings): def test_yolov4_tiny(catch_warnings): - network = YOLOV4TinyNetwork(num_classes=2, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -147,7 +148,8 @@ def test_yolov4_tiny_train(tmpdir): category=PossibleUserWarning, ) - network = YOLOV4TinyNetwork(num_classes=2, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. 
+ network = YOLOV4TinyNetwork(num_classes=2, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -158,7 +160,8 @@ def test_yolov4_tiny_train(tmpdir): def test_yolov4(catch_warnings): - network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -175,7 +178,8 @@ def test_yolov4_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV4Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -186,7 +190,8 @@ def test_yolov4_train(tmpdir, catch_warnings): def test_yolov4p6(catch_warnings): - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -203,7 +208,8 @@ def test_yolov4p6_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV4P6Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -214,7 +220,8 @@ def test_yolov4p6_train(tmpdir, catch_warnings): def test_yolov5(catch_warnings): - network = YOLOV5Network(num_classes=2, depth=1, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -231,7 +238,8 @@ def test_yolov5_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV5Network(num_classes=2, depth=1, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV5Network(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -242,7 +250,8 @@ def test_yolov5_train(tmpdir, catch_warnings): def test_yolov7(catch_warnings): - network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. 
+ network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -259,7 +268,8 @@ def test_yolov7_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128)) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOV7Network(num_classes=2, widths=(4, 8, 16, 32, 64, 128), overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) @@ -270,7 +280,8 @@ def test_yolov7_train(tmpdir, catch_warnings): def test_yolox(catch_warnings): - network = YOLOXNetwork(num_classes=2, depth=1, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) image = torch.rand(3, 256, 256) @@ -287,7 +298,8 @@ def test_yolox_train(tmpdir, catch_warnings): category=PossibleUserWarning, ) - network = YOLOXNetwork(num_classes=2, depth=1, width=4) + # Using giou allows the tests to pass also with older versions of Torchvision. + network = YOLOXNetwork(num_classes=2, depth=1, width=4, overlap_func="giou") model = YOLO(network, confidence_threshold=0.5) train_dl = DataLoader(DummyDetectionDataset(num_classes=2), collate_fn=_collate_fn) From 4807f3c742c0444b795705c81bb12cf7d065c834 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Apr 2023 06:31:42 +0300 Subject: [PATCH 69/76] Use double underscores in links in the docstring to avoid duplicate names --- pl_bolts/models/detection/yolo/yolo_module.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index 4d3532746e..a282870116 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -89,18 +89,18 @@ class YOLO(LightningModule): """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, YOLOv7, Scaled-YOLOv4, and YOLOX. - *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `_ + *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `__ - *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `_ + *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `__ - *YOLOv7 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `_ + *YOLOv7 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `__ *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao - `_ + `__ - *YOLOX paper*: `Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun `_ + *YOLOX paper*: `Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun `__ - *Implementation*: `Seppo Enarvi `_ + *Implementation*: `Seppo Enarvi `__ The network architecture can be written in PyTorch, or read from a Darknet configuration file using the :class:`~.darknet_network.DarknetNetwork` class. 
``DarknetNetwork`` is also able to read weights that have been From 64a3e47d71758bf350ca4922278ab867c2af92c3 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 6 Apr 2023 07:53:58 +0300 Subject: [PATCH 70/76] Check that targets are given in training mode --- pl_bolts/models/detection/yolo/yolo_module.py | 94 ++++++++++--------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index a282870116..af3b5e2197 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -42,49 +42,6 @@ warn_missing_pkg("torchvision") -def validate_batch(images: Union[Tensor, IMAGES], targets: Optional[TARGETS]) -> None: - """Validates the format of a batch of data. - - Args: - images: A tensor containing a batch of images or a list of image tensors. - targets: A list of target dictionaries or ``None``. If a list is provided, there should be as many target - dictionaries as there are images. - """ - if not isinstance(images, Tensor): - if not isinstance(images, (tuple, list)): - raise TypeError(f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}.") - if not images: - raise ValueError("No images in batch.") - shape = images[0].shape - for image in images: - if not isinstance(image, Tensor): - raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") - if image.shape != shape: - raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") - - if targets is None: - return - - if not isinstance(targets, (tuple, list)): - raise TypeError(f"Expected targets to be a tuple or a list, got {type(images).__name__}.") - if len(images) != len(targets): - raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") - - for target in targets: - boxes = target["boxes"] - if not isinstance(boxes, Tensor): - raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}.") - if (boxes.ndim != 2) or (boxes.shape[-1] != 4): - raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") - labels = target["labels"] - if not isinstance(labels, Tensor): - raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels).__name__}.") - if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): - raise ValueError( - f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." - ) - - class YOLO(LightningModule): """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, YOLOv7, Scaled-YOLOv4, and YOLOX. @@ -205,7 +162,7 @@ def forward( ``anchors`` is the feature map size (width * height) times the number of anchors per cell. The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. """ - validate_batch(images, targets) + self.validate_batch(images, targets) images_tensor = images if isinstance(images, Tensor) else torch.stack(images) detections, losses, hits = self.network(images_tensor, targets) @@ -444,6 +401,55 @@ def process(boxes: Tensor, labels: Tensor, **other: Any) -> Dict[str, Any]: return [process(**t) for t in targets] + def validate_batch(self, images: Union[Tensor, IMAGES], targets: Optional[TARGETS]) -> None: + """Validates the format of a batch of data. 
+ + Args: + images: A tensor containing a batch of images or a list of image tensors. + targets: A list of target dictionaries or ``None``. If a list is provided, there should be as many target + dictionaries as there are images. + """ + if not isinstance(images, Tensor): + if not isinstance(images, (tuple, list)): + raise TypeError(f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}.") + if not images: + raise ValueError("No images in batch.") + shape = images[0].shape + for image in images: + if not isinstance(image, Tensor): + raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") + if image.shape != shape: + raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") + + if targets is None: + if self.training: + raise ValueError("Targets should be given in training mode.") + else: + return + + if not isinstance(targets, (tuple, list)): + raise TypeError(f"Expected targets to be a tuple or a list, got {type(images).__name__}.") + if len(images) != len(targets): + raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") + + for target in targets: + if "boxes" not in target: + raise ValueError("Target dictionary doesn't contain boxes.") + boxes = target["boxes"] + if not isinstance(boxes, Tensor): + raise TypeError(f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}.") + if (boxes.ndim != 2) or (boxes.shape[-1] != 4): + raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") + if "labels" not in target: + raise ValueError("Target dictionary doesn't contain labels.") + labels = target["labels"] + if not isinstance(labels, Tensor): + raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels).__name__}.") + if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): + raise ValueError( + f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." + ) + class CLIYOLO(YOLO): """A subclass of YOLO that can be easily configured using LightningCLI. From 26c9a7c2e861f1710450c872b1f25d64dd10a0f7 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Sat, 8 Apr 2023 11:27:21 +0300 Subject: [PATCH 71/76] Fixed docstring formatting --- pl_bolts/models/detection/yolo/yolo_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pl_bolts/models/detection/yolo/yolo_module.py b/pl_bolts/models/detection/yolo/yolo_module.py index af3b5e2197..604944508e 100644 --- a/pl_bolts/models/detection/yolo/yolo_module.py +++ b/pl_bolts/models/detection/yolo/yolo_module.py @@ -43,8 +43,8 @@ class YOLO(LightningModule): - """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, YOLOv5, - YOLOv7, Scaled-YOLOv4, and YOLOX. + """PyTorch Lightning implementation of YOLO that supports the most important features of YOLOv3, YOLOv4, + YOLOv5, YOLOv7, Scaled-YOLOv4, and YOLOX. 
*YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `__ From 98e4c2e7e2c832a73ceb890b921fd09b1b69ff63 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Tue, 23 May 2023 11:37:01 +0300 Subject: [PATCH 72/76] Add --- tests/models/yolo/unit/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/yolo/unit/test_utils.py b/tests/models/yolo/unit/test_utils.py index e8e1d02ad1..ad355f43fe 100644 --- a/tests/models/yolo/unit/test_utils.py +++ b/tests/models/yolo/unit/test_utils.py @@ -53,7 +53,7 @@ def test_global_xy(catch_warnings): @pytest.mark.parametrize( - "dims1, dims2, expected_ious", + ("dims1", "dims2", "expected_ious"), [ ( torch.tensor([[1.0, 1.0], [10.0, 1.0], [100.0, 10.0]]), From bf6295bbff1a3ee9e931b73759aad627ab2e8043 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Tue, 23 May 2023 14:55:03 +0300 Subject: [PATCH 73/76] Removed files that popped up back in the merge --- tests/models/yolo/unit/test_yolo_config.py | 119 --------------------- tests/models/yolo/unit/test_yolo_layers.py | 51 --------- 2 files changed, 170 deletions(-) delete mode 100644 tests/models/yolo/unit/test_yolo_config.py delete mode 100644 tests/models/yolo/unit/test_yolo_layers.py diff --git a/tests/models/yolo/unit/test_yolo_config.py b/tests/models/yolo/unit/test_yolo_config.py deleted file mode 100644 index fb9b1bc811..0000000000 --- a/tests/models/yolo/unit/test_yolo_config.py +++ /dev/null @@ -1,119 +0,0 @@ -import warnings - -import pytest -from pytorch_lightning.utilities.warnings import PossibleUserWarning - -from pl_bolts.models.detection.yolo.yolo_config import ( - _create_convolutional, - _create_maxpool, - _create_shortcut, - _create_upsample, -) - - -@pytest.mark.parametrize( - "config", - [ - ({"batch_normalize": 1, "filters": 8, "size": 3, "stride": 1, "pad": 1, "activation": "leaky"}), - ({"batch_normalize": 0, "filters": 2, "size": 1, "stride": 1, "pad": 1, "activation": "mish"}), - ({"batch_normalize": 1, "filters": 6, "size": 3, "stride": 2, "pad": 1, "activation": "logistic"}), - ({"batch_normalize": 0, "filters": 4, "size": 3, "stride": 2, "pad": 0, "activation": "linear"}), - ], -) -def test_create_convolutional(config, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - conv, _ = _create_convolutional(config, [3]) - - assert conv.conv.out_channels == config["filters"] - assert conv.conv.kernel_size == (config["size"], config["size"]) - assert conv.conv.stride == (config["stride"], config["stride"]) - - activation = config["activation"] - pad_size = (config["size"] - 1) // 2 if config["pad"] else 0 - - if config["pad"]: - assert conv.conv.padding == (pad_size, pad_size) - - if config["batch_normalize"]: - assert len(conv) == 3 - - if activation != "linear": - if activation != "logistic": - assert activation == conv[-1].__class__.__name__.lower()[: len(activation)] - elif activation == "logistic": - assert conv[-1].__class__.__name__.lower() == "sigmoid" - - -@pytest.mark.parametrize( - "config", - [ - ( - { - "size": 2, - "stride": 2, - } - ), - ( - { - "size": 6, - "stride": 3, - } - ), - ], -) -def test_create_maxpool(config, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - pad_size = (config["size"] - 1) // 2 - maxpool, _ = _create_maxpool(config, [3]) - - assert maxpool.kernel_size == config["size"] - assert maxpool.stride == 
config["stride"] - assert maxpool.padding == pad_size - - -@pytest.mark.parametrize( - "config", - [ - ({"from": 1, "activation": "linear"}), - ({"from": 3, "activation": "linear"}), - ], -) -def test_create_shortcut(config, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - shortcut, _ = _create_shortcut(config, [3]) - - assert shortcut.source_layer == config["from"] - - -@pytest.mark.parametrize( - "config", - [ - ({"stride": 2}), - ({"stride": 4}), - ], -) -def test_create_upsample(config, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - upsample, _ = _create_upsample(config, [3]) - - assert upsample.scale_factor == float(config["stride"]) diff --git a/tests/models/yolo/unit/test_yolo_layers.py b/tests/models/yolo/unit/test_yolo_layers.py deleted file mode 100644 index 70e9b32e53..0000000000 --- a/tests/models/yolo/unit/test_yolo_layers.py +++ /dev/null @@ -1,51 +0,0 @@ -import warnings - -import pytest -import torch -from pytorch_lightning.utilities.warnings import PossibleUserWarning - -from pl_bolts.models.detection.yolo.yolo_layers import GIoULoss, IoULoss, SELoss, _corner_coordinates - - -@pytest.mark.parametrize( - ("xy", "wh", "expected"), - [ - ([0.0, 0.0], [1.0, 1.0], [-0.5, -0.5, 0.5, 0.5]), - ([5.0, 5.0], [2.0, 2.0], [4.0, 4.0, 6.0, 6.0]), - ], -) -def test_corner_coordinates(xy, wh, expected, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - xy = torch.tensor(xy) - wh = torch.tensor(wh) - corners = _corner_coordinates(xy, wh) - assert torch.allclose(corners, torch.tensor(expected)) - - -@pytest.mark.parametrize( - ("loss_func", "bbox1", "bbox2", "expected"), - [ - (GIoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.4144532680511475), - (IoULoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 1.0), - (SELoss, [[0.0, 0.0, 120.0, 200.0]], [[189.0, 93.0, 242.0, 215.0]], 59479.0), - ], -) -def test_loss_functions(loss_func, bbox1, bbox2, expected, catch_warnings): - warnings.filterwarnings( - "ignore", - message=".*does not have many workers which may be a bottleneck.*", - category=PossibleUserWarning, - ) - - loss_func = loss_func() - tensor1 = torch.tensor(bbox1, dtype=torch.float32) - tensor2 = torch.tensor(bbox2, dtype=torch.float32) - - loss = loss_func(tensor1, tensor2) - assert loss.item() > 0.0 - assert loss.item() == expected From 904281247e1152199a886379dc40ee835ea95fab Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 24 May 2023 06:06:51 +0300 Subject: [PATCH 74/76] Code formatting fixed by ruff --- src/pl_bolts/models/detection/yolo/layers.py | 5 +---- src/pl_bolts/models/detection/yolo/loss.py | 10 ++-------- src/pl_bolts/models/detection/yolo/torch_networks.py | 5 +---- tests/models/yolo/unit/test_utils.py | 4 ++-- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/pl_bolts/models/detection/yolo/layers.py b/src/pl_bolts/models/detection/yolo/layers.py index d401fa3a1b..1be1fe10e1 100644 --- a/src/pl_bolts/models/detection/yolo/layers.py +++ b/src/pl_bolts/models/detection/yolo/layers.py @@ -137,10 +137,7 @@ def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, PREDS]: image_xy = global_xy(xy, image_size) prior_shapes = torch.tensor(self.prior_shapes, 
dtype=wh.dtype, device=wh.device) - if self.input_is_normalized: - image_wh = 4 * torch.square(wh) * prior_shapes - else: - image_wh = torch.exp(wh) * prior_shapes + image_wh = 4 * torch.square(wh) * prior_shapes if self.input_is_normalized else torch.exp(wh) * prior_shapes box = torch.cat((image_xy, image_wh), -1) box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy") output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) diff --git a/src/pl_bolts/models/detection/yolo/loss.py b/src/pl_bolts/models/detection/yolo/loss.py index ded7143dc0..d91b443466 100644 --- a/src/pl_bolts/models/detection/yolo/loss.py +++ b/src/pl_bolts/models/detection/yolo/loss.py @@ -288,10 +288,7 @@ def pairwise( """ loss_shape = torch.Size([len(preds["boxes"]), len(targets["boxes"])]) - if input_is_normalized: - bce_func = binary_cross_entropy - else: - bce_func = binary_cross_entropy_with_logits + bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits overlap = self._pairwise_overlap(preds["boxes"], targets["boxes"]) assert overlap.shape == loss_shape @@ -338,10 +335,7 @@ def elementwise_sums( Returns: The final losses. """ - if input_is_normalized: - bce_func = binary_cross_entropy - else: - bce_func = binary_cross_entropy_with_logits + bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits overlap_loss = self._elementwise_overlap_loss(targets["boxes"], preds["boxes"]) overlap = 1.0 - overlap_loss diff --git a/src/pl_bolts/models/detection/yolo/torch_networks.py b/src/pl_bolts/models/detection/yolo/torch_networks.py index 9b18a1ddcb..480df94974 100644 --- a/src/pl_bolts/models/detection/yolo/torch_networks.py +++ b/src/pl_bolts/models/detection/yolo/torch_networks.py @@ -405,10 +405,7 @@ def maxpool(out_channels: int) -> nn.Module: ) def stage(out_channels: int, use_maxpool: bool) -> nn.Module: - if use_maxpool: - downsample_module = maxpool(out_channels) - else: - downsample_module = downsample(out_channels // 2, out_channels) + downsample_module = maxpool(out_channels) if use_maxpool else downsample(out_channels // 2, out_channels) stage_module = TinyStage(out_channels, activation=activation, norm=normalization) return nn.Sequential(OrderedDict([("downsample", downsample_module), ("stage", stage_module)])) diff --git a/tests/models/yolo/unit/test_utils.py b/tests/models/yolo/unit/test_utils.py index ad355f43fe..75dc5eac5c 100644 --- a/tests/models/yolo/unit/test_utils.py +++ b/tests/models/yolo/unit/test_utils.py @@ -15,7 +15,7 @@ ) -@pytest.mark.parametrize("width,height", [(10, 5)]) +@pytest.mark.parametrize(("width", "height"), [(10, 5)]) def test_grid_offsets(width: int, height: int, catch_warnings): size = torch.tensor([width, height]) offsets = grid_offsets(size) @@ -26,7 +26,7 @@ def test_grid_offsets(width: int, height: int, catch_warnings): assert torch.equal(offsets[:, 0, 1], torch.arange(height, dtype=offsets.dtype)) -@pytest.mark.parametrize("width,height", [(10, 5)]) +@pytest.mark.parametrize(("width", "height"), [(10, 5)]) def test_grid_centers(width: int, height: int, catch_warnings): size = torch.tensor([width, height]) centers = grid_centers(size) From a399bed2267158d6d1089ee7d8e83178adbab3cf Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Wed, 24 May 2023 08:25:42 +0300 Subject: [PATCH 75/76] Work around a problem with mypy and if-else ternary operator --- src/pl_bolts/models/detection/yolo/loss.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/src/pl_bolts/models/detection/yolo/loss.py b/src/pl_bolts/models/detection/yolo/loss.py index d91b443466..ded7143dc0 100644 --- a/src/pl_bolts/models/detection/yolo/loss.py +++ b/src/pl_bolts/models/detection/yolo/loss.py @@ -288,7 +288,10 @@ def pairwise( """ loss_shape = torch.Size([len(preds["boxes"]), len(targets["boxes"])]) - bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits + if input_is_normalized: + bce_func = binary_cross_entropy + else: + bce_func = binary_cross_entropy_with_logits overlap = self._pairwise_overlap(preds["boxes"], targets["boxes"]) assert overlap.shape == loss_shape @@ -335,7 +338,10 @@ def elementwise_sums( Returns: The final losses. """ - bce_func = binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits + if input_is_normalized: + bce_func = binary_cross_entropy + else: + bce_func = binary_cross_entropy_with_logits overlap_loss = self._elementwise_overlap_loss(targets["boxes"], preds["boxes"]) overlap = 1.0 - overlap_loss From c97614780be8b2a1ad628d3e95ebf484640827a4 Mon Sep 17 00:00:00 2001 From: Seppo Enarvi Date: Thu, 25 May 2023 21:33:00 +0300 Subject: [PATCH 76/76] Fixed bcefunc assignment so that both ruff and mypy are happy mypy doesn't understand "= ... if ... else ..." and ruff doesn't accept the longer form. --- src/pl_bolts/models/detection/yolo/loss.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pl_bolts/models/detection/yolo/loss.py b/src/pl_bolts/models/detection/yolo/loss.py index ded7143dc0..1152441aad 100644 --- a/src/pl_bolts/models/detection/yolo/loss.py +++ b/src/pl_bolts/models/detection/yolo/loss.py @@ -288,10 +288,9 @@ def pairwise( """ loss_shape = torch.Size([len(preds["boxes"]), len(targets["boxes"])]) - if input_is_normalized: - bce_func = binary_cross_entropy - else: - bce_func = binary_cross_entropy_with_logits + bce_func: Callable[..., Tensor] = ( + binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits # type: ignore + ) overlap = self._pairwise_overlap(preds["boxes"], targets["boxes"]) assert overlap.shape == loss_shape @@ -338,10 +337,9 @@ def elementwise_sums( Returns: The final losses. """ - if input_is_normalized: - bce_func = binary_cross_entropy - else: - bce_func = binary_cross_entropy_with_logits + bce_func: Callable[..., Tensor] = ( + binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits # type: ignore + ) overlap_loss = self._elementwise_overlap_loss(targets["boxes"], preds["boxes"]) overlap = 1.0 - overlap_loss
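The choice between the two BCE variants made in this last commit can be seen in isolation in the following standalone sketch; the helper name confidence_bce is only for illustration and does not exist in the loss module.

from typing import Callable

import torch
from torch import Tensor
from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits


def confidence_bce(preds: Tensor, targets: Tensor, input_is_normalized: bool) -> Tensor:
    # Normalized (sigmoid-activated) predictions use plain BCE, while raw logits use
    # the numerically safer *_with_logits variant, mirroring the assignment above.
    bce_func: Callable[..., Tensor] = (
        binary_cross_entropy if input_is_normalized else binary_cross_entropy_with_logits
    )
    return bce_func(preds, targets, reduction="none")


logits = torch.randn(4)
targets = torch.tensor([1.0, 0.0, 1.0, 0.0])
print(confidence_bce(logits, targets, input_is_normalized=False))
print(confidence_bce(logits.sigmoid(), targets, input_is_normalized=True))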