Source code for micromind.utils.yolo

"""
Helper functions.

Authors:
    - Matteo Beltrami, 2023
    - Francesco Paissan, 2023
"""
import time
import types
from collections import defaultdict
from pathlib import Path

import cv2
import numpy as np
import torch
import torchvision
import yaml


def get_variant_multiples(variant):
    tmp = {
        "n": (0.33, 0.25, 2.0),
        "s": (0.33, 0.50, 2.0),
        "m": (0.67, 0.75, 1.5),
        "l": (1.0, 1.0, 1.0),
        "x": (1, 1.25, 1.0),
    }.get(variant, None)
    return tmp[1], tmp[2], tmp[0]

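# Illustrative usage (a sketch): the stored tuples follow the usual YOLOv8
# (depth, width, ratio) convention and are returned reordered as
# (width, ratio, depth).
#   >>> get_variant_multiples("n")
#   (0.25, 2.0, 0.33)
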
def load_config(file_path):
    """
    Load configuration from a YAML file and preprocess it for training.

    Arguments
    ---------
    file_path : str
        Path to the YAML configuration file.

    Returns
    -------
    m_cfg : types.SimpleNamespace
        Model configuration containing task-specific parameters.
    data_cfg : dict
        Data configuration containing paths and settings for train, val and test.
    """
    with open(file_path, "r") as file:
        config = yaml.safe_load(file)

    path = Path(Path.cwd() / config["path"]).resolve()

    if "train" in config:
        if not isinstance(config["train"], list):
            train = Path(path / config["train"])
        else:
            train = [Path(path / p) for p in config["train"]]
    else:
        train = None

    if "val" in config:
        if not isinstance(config["val"], list):
            val = Path(path / config["val"])
        else:
            val = [Path(path / p) for p in config["val"]]
    else:
        val = None
    # val = Path(path / config["val"]) if "val" in config else None

    if ("test" not in config) or (config["test"] is None):
        test = None
    else:
        test = Path(path / config["test"])

    data_cfg = {
        "path": path,
        "train": train,
        "val": val,
        "test": test,
        "names": config["names"],
        "download": config.get("download"),
        "yaml_file": file_path,
        "nc": len(config["names"]),
    }

    m_cfg = {
        "task",
        "mode",
        "imgsz",
        "rect",
        "cache",
        "single_cls",
        "fraction",
        "overlap_mask",
        "mask_ratio",
        "classes",
        "box",
        "cls",
        "dfl",
        "hsv_h",
        "hsv_s",
        "hsv_v",
        "degrees",
        "translate",
        "scale",
        "shear",
        "perspective",
        "flipud",
        "fliplr",
        "mosaic",
        "mixup",
        "copy_paste",
    }
    m_cfg = {key: config[key] for key in m_cfg if key in config}
    m_cfg = types.SimpleNamespace(**m_cfg)

    return m_cfg, data_cfg

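# Illustrative usage (a sketch; the file name and its contents are assumptions,
# the YAML must provide at least "path" and "names" plus optional
# train/val/test keys):
#   >>> m_cfg, data_cfg = load_config("cfg/coco8.yaml")
#   >>> data_cfg["nc"] == len(data_cfg["names"])
#   True
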
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Calculate padding value for a convolution operation based on kernel
    size and dilation.
    This function computes the padding value for a convolution operation to
    maintain the spatial size of the input tensor.

    Arguments
    ---------
    k : int
        Kernel size for the convolution operation. If a single integer is
        provided, it's assumed that all dimensions have the same kernel size.
    p : int, optional
        Padding value for the convolution operation. If not provided, it will
        be calculated to maintain the spatial size of the input tensor.
    d : int, optional
        Dilation for the convolution operation. Default is 1.

    Returns
    -------
    The padding value to maintain the spatial size of the input tensor : int
    """
    if d > 1:
        k = (
            d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
        )  # actual kernel-size
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p

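# Illustrative usage (a sketch): with dilation, the effective kernel grows to
# d * (k - 1) + 1 before the "same" padding k // 2 is computed.
#   >>> autopad(3)        # 3x3 kernel, no dilation -> pad 1
#   1
#   >>> autopad(3, d=2)   # effective kernel 5 -> pad 2
#   2
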
def make_anchors(feats, strides, grid_cell_offset=0.5):
    """Generate anchor points and stride tensors.
    This function generates anchor points for each feature map and stride
    combination. It is commonly used in object detection tasks to define
    anchor boxes.

    Arguments
    ---------
    feats : list of torch.Tensor
        Feature maps (tensors) from which anchor points will be generated.
    strides : torch.Tensor
        Stride values corresponding to each feature map. Strides define the
        spacing between anchor points.
    grid_cell_offset : float, optional
        Offset to be added to the grid cell coordinates when generating
        anchor points. Default is 0.5.

    Returns
    -------
    anchor_points : torch.Tensor
        Concatenated anchor points for all feature maps as a 2D tensor.
    stride_tensor : torch.Tensor
        Concatenated stride values for all anchor points as a 2D tensor.
    """
    anchor_points, stride_tensor = [], []
    assert feats is not None
    dtype, device = feats[0].dtype, feats[0].device
    for i, stride in enumerate(strides):
        _, _, h, w = feats[i].shape
        sx = (
            torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset
        )  # shift x
        sy = (
            torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset
        )  # shift y
        sy, sx = torch.meshgrid(sy, sx, indexing="ij")
        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
        stride_tensor.append(
            torch.full((h * w, 1), stride, dtype=dtype, device=device)
        )
    return torch.cat(anchor_points), torch.cat(stride_tensor)

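# Illustrative usage (a sketch with dummy feature maps at strides 8/16/32 for a
# 64x64 input; the channel count is arbitrary):
#   >>> feats = [torch.zeros(1, 64, 8, 8),
#   ...          torch.zeros(1, 64, 4, 4),
#   ...          torch.zeros(1, 64, 2, 2)]
#   >>> anchors, strides = make_anchors(feats, [8, 16, 32])
#   >>> anchors.shape, strides.shape
#   (torch.Size([84, 2]), torch.Size([84, 1]))
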
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Convert distance predictions to bounding box coordinates.
    This function takes distance predictions and anchor points to calculate
    bounding box coordinates.

    Arguments
    ---------
    distance : torch.Tensor
        Tensor containing distance predictions. It should be in the format
        [lt, rb] if `xywh` is True, or [x1y1, x2y2] if `xywh` is False.
    anchor_points : torch.Tensor
        Tensor containing anchor points used for the conversion.
    xywh : bool, optional
        If True, the function returns bounding boxes in the format
        [center_x, center_y, width, height]. If False, it returns bounding
        boxes in the format [x1, y1, x2, y2]. Default is True.
    dim : int, optional
        The dimension along which the tensor is split into lt and rb.
        Default is -1.

    Returns
    -------
    Converted bounding box coordinates in the specified format : torch.Tensor
    """
    lt, rb = torch.chunk(distance, chunks=2, dim=dim)
    x1y1 = anchor_points - lt
    x2y2 = anchor_points + rb
    if xywh:
        c_xy = (x1y1 + x2y2) / 2
        wh = x2y2 - x1y1
        return torch.cat((c_xy, wh), dim=1)
    return torch.cat((x1y1, x2y2), dim=1)

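# Illustrative usage (a sketch): an anchor at (4.5, 4.5) with left/top/right/
# bottom distances (1, 2, 3, 4) becomes a box centred at (5.5, 5.5) of size 4x6.
#   >>> anchor = torch.tensor([[4.5, 4.5]])
#   >>> dist = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
#   >>> dist2bbox(dist, anchor, xywh=True)
#   tensor([[5.5000, 5.5000, 4.0000, 6.0000]])
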
def compute_transform(
    image, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, stride=32
):
    """Compute a transformation of an image to the specified size and format.
    This function computes a transformation of the input image to the
    specified new size and format, while optionally maintaining the aspect
    ratio or adding padding as needed.

    Arguments
    ---------
    image : torch.Tensor
        The input image to be transformed.
    new_shape : int or tuple, optional
        The target size of the transformed image. If an integer is provided,
        the image is resized to have the same width and height. If a tuple of
        two integers is provided, it represents the new width and height.
        Default is (640, 640).
    auto : bool, optional
        If True, automatically calculates padding to ensure the output size
        is divisible by the specified `stride`. Default is False.
    scaleFill : bool, optional
        If True, scales the image to completely fill the target size without
        maintaining the aspect ratio. Default is False.
    scaleup : bool, optional
        If True, allows the image to be scaled up (enlarged) if necessary.
        Default is True.
    stride : int, optional
        The stride value used for padding calculation when `auto` is True.
        Default is 32.

    Returns
    -------
    The transformed image : torch.Tensor
    """
    shape = image.shape[-2:]  # current shape [height, width]
    new_shape = (new_shape, new_shape) if isinstance(new_shape, int) else new_shape
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    r = min(r, 1.0) if not scaleup else r
    new_unpad = (int(round(shape[1] * r)), int(round(shape[0] * r)))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
    dw, dh = (dw % stride, dh % stride) if auto else (0.0, 0.0)
    new_unpad = (new_shape[1], new_shape[0]) if scaleFill else new_unpad
    new_unpad = (new_unpad[1], new_unpad[0])
    dw /= 2
    dh /= 2
    image = torch.nn.functional.interpolate(
        image.unsqueeze(0), size=new_unpad, mode="bilinear", align_corners=False
    ).squeeze(0)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    image = torch.nn.functional.pad(image, (left, right, top, bottom), value=114)
    return image

def preprocess(im, imgsz=640, model_stride=32, model_pt=True):
    """Preprocess a batch of images for inference.
    This function preprocesses a batch of images for inference by resizing,
    transforming, and normalizing them.

    Arguments
    ---------
    im : torch.Tensor or list of torch.Tensor
        An input image or a batch of images to be preprocessed.
    imgsz : int, optional
        The target size of the images after preprocessing. Default is 640.
    model_stride : int, optional
        The stride value used for padding calculation when `auto` is True in
        `compute_transform`. Default is 32.
    model_pt : bool, optional
        If True, the function automatically calculates the padding to
        maintain the same shapes for all input images in the batch.
        Default is True.

    Returns
    -------
    torch.Tensor
        The preprocessed batch of images as a torch.Tensor with shape
        (n, 3, h, w), where n is the number of images, 3 represents the RGB
        channels, and h and w are the height and width of the images.
    """
    auto = model_pt
    im = compute_transform(im, new_shape=imgsz, auto=auto, stride=model_stride)
    im = im.float() / 255.0  # 0 - 255 to 0.0 - 1.0
    im = im.unsqueeze(0)
    return im

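# Illustrative usage (a sketch with a random float CHW image standing in for a
# decoded frame; with model_pt=True the image is only padded up to a multiple
# of the stride, so a 480x640 input keeps its spatial size):
#   >>> img = torch.randint(0, 256, (3, 480, 640)).float()
#   >>> batch = preprocess(img, imgsz=640)
#   >>> batch.shape
#   torch.Size([1, 3, 480, 640])
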
def box_area(box):
    """Calculate the area of bounding boxes.
    This function calculates the area of bounding boxes represented as
    [x1, y1, x2, y2].

    Arguments
    ---------
    box : torch.Tensor
        A tensor containing bounding boxes in the format [x1, y1, x2, y2].

    Returns
    -------
    A tensor containing the area of each bounding box : torch.Tensor
    """
    return (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])

def box_iou(box1, box2):
    """Calculate the Intersection over Union (IoU) between two sets of
    bounding boxes.
    This function computes the IoU between two sets of bounding boxes.

    Arguments
    ---------
    box1 : numpy.ndarray
        The first set of bounding boxes in the format [x1, y1, x2, y2].
    box2 : numpy.ndarray
        The second set of bounding boxes in the format [x1, y1, x2, y2].

    Returns
    -------
    numpy.ndarray
        A 2D numpy array containing the IoU between each pair of bounding
        boxes in box1 and box2.
    """
    lt = np.maximum(box1[:, None, :2], box2[:, :2])
    rb = np.minimum(box1[:, None, 2:], box2[:, 2:])
    wh = np.clip(rb - lt, 0, None)
    inter = wh[:, :, 0] * wh[:, :, 1]
    area1 = box_area(box1)[:, None]
    area2 = box_area(box2)[None, :]
    iou = inter / (area1 + area2 - inter)
    return iou

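# Illustrative usage (a sketch with two boxes that overlap on half their area,
# giving IoU = 50 / 150):
#   >>> a = np.array([[0.0, 0.0, 10.0, 10.0]])
#   >>> b = np.array([[0.0, 5.0, 10.0, 15.0]])
#   >>> box_iou(a, b)
#   array([[0.33333333]])
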
def non_max_suppression(
    prediction,
    conf_thres=0.25,
    iou_thres=0.45,
    classes=None,
    agnostic=False,
    multi_label=False,
    labels=(),
    max_det=300,
    nc=0,  # number of classes (optional)
    max_time_img=0.05,
    max_nms=30000,
    max_wh=7680,
):
    """
    Perform non-maximum suppression (NMS) on a set of boxes, with support for
    masks and multiple labels per box.

    Parameters
    ----------
    prediction : torch.Tensor
        A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
        containing the predicted boxes, classes, and masks. The tensor should
        be in the format output by a model, such as YOLO.
    conf_thres : float, optional
        The confidence threshold below which boxes will be filtered out.
        Valid values are between 0.0 and 1.0. Default is 0.25.
    iou_thres : float, optional
        The IoU threshold below which boxes will be filtered out during NMS.
        Valid values are between 0.0 and 1.0. Default is 0.45.
    classes : List[int], optional
        A list of class indices to consider. If None, all classes will be
        considered.
    agnostic : bool, optional
        If True, the model is agnostic to the number of classes, and all
        classes will be considered as one. Default is False.
    multi_label : bool, optional
        If True, each box may have multiple labels. Default is False.
    labels : List[List[Union[int, float, torch.Tensor]]], optional
        A list of lists, where each inner list contains the apriori labels for
        a given image. The list should be in the format output by a
        dataloader, with each label being a tuple of
        (class_index, x1, y1, x2, y2).
    max_det : int, optional
        The maximum number of boxes to keep after NMS. Default is 300.
    nc : int, optional
        The number of classes output by the model. Any indices after this
        will be considered masks. Default is 0.
    max_time_img : float, optional
        The maximum time (seconds) for processing one image. Default is 0.05.
    max_nms : int, optional
        The maximum number of boxes into torchvision.ops.nms().
        Default is 30000.
    max_wh : int, optional
        The maximum box width and height in pixels. Default is 7680.

    Returns
    -------
    List[torch.Tensor]
        A list of length batch_size, where each element is a tensor of shape
        (num_boxes, 6 + num_masks) containing the kept boxes, with columns
        (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
    """
    # Checks
    assert (
        0 <= conf_thres <= 1
    ), f"Invalid Confidence threshold {conf_thres}, valid values are between 0 and 1.0"
    assert (
        0 <= iou_thres <= 1
    ), f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
    if isinstance(
        prediction, (list, tuple)
    ):  # YOLOv8 model in validation model, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    bs = prediction.shape[0]  # batch size
    nc = nc or (prediction.shape[1] - 4)  # number of classes
    nm = prediction.shape[1] - nc - 4
    mi = 4 + nc  # mask start index
    xc = prediction[:, 4:mi].amax(1) > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    time_limit = 0.5 + max_time_img * bs  # seconds to quit after
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)

    prediction = prediction.transpose(-1, -2)  # shape(1,84,6300) to shape(1,6300,84)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # xywh to xyxy

    t = time.time()
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
            v[:, :4] = xywh2xyxy(lb[:, 1:5])  # box
            v[range(len(lb)), lb[:, 0].long() + 4] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Detections matrix nx6 (xyxy, conf, cls)
        box, cls, mask = x.split((4, nc, nm), 1)

        if multi_label:
            i, j = torch.where(cls > conf_thres)
            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = cls.max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        if n > max_nms:  # excess boxes
            x = x[
                x[:, 4].argsort(descending=True)[:max_nms]
            ]  # sort by confidence and remove excess boxes

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            # LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded

    return output

@torch.no_grad()
def postprocess(preds, img, orig_imgs):
    """Perform post-processing on the predictions.
    This function applies post-processing to the predictions, including
    Non-Maximum Suppression (NMS) and scaling of bounding boxes.

    Arguments
    ---------
    preds : list of numpy.ndarray
        A list of prediction arrays from the object detection model.
    img : numpy.ndarray
        The input image on which the predictions were made.
    orig_imgs : numpy.ndarray or list of numpy.ndarray
        The original image(s) before any preprocessing.

    Returns
    -------
    list of numpy.ndarray
        A list of post-processed prediction arrays, each containing bounding
        boxes and associated information.
    """
    preds = non_max_suppression(
        prediction=preds,
        conf_thres=0.25,
        iou_thres=0.7,
        agnostic=False,
        max_det=300,
        multi_label=True,
    )

    all_preds = []
    for i, pred in enumerate(preds):
        orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
        if isinstance(orig_img, dict):
            pred[:, :4] = scale_boxes(
                tuple(img["img"].shape[2:4]), pred[:, :4], orig_img["ori_shape"][i]
            )  # batch
        else:
            pred[:, :4] = scale_boxes(
                img.shape[2:], pred[:, :4], orig_img.shape[1:]
            )  # single img
        all_preds.append(pred)

    return all_preds

def draw_bounding_boxes_and_save(
    orig_img_paths, output_img_paths, all_predictions, class_labels, iou_threshold=0.5
):
    """Draw bounding boxes on images based on object detection predictions
    and save the result.
    This function draws bounding boxes on images based on object detection
    predictions and saves the result. It also prints the number of objects
    detected for each class.

    Arguments
    ---------
    orig_img_paths : list of str
        A list of file paths to the original input images.
    output_img_paths : list of str
        A list of file paths to save the images with bounding boxes.
    all_predictions : list of list of numpy.ndarray
        A list of lists of prediction arrays from the object detection model.
    class_labels : list of str
        A list of class labels corresponding to the object classes.
    iou_threshold : float, optional
        The IoU threshold used for non-maximum suppression to remove
        overlapping bounding boxes. Default is 0.5.

    Returns
    -------
    None
    """
    color_dict = {
        label: tuple(
            (((i + 1) * 50) % 256, ((i + 1) * 100) % 256, ((i + 1) * 150) % 256)
        )
        for i, label in enumerate(class_labels)
    }
    font = cv2.FONT_HERSHEY_SIMPLEX

    def is_bright_color(color):
        r, g, b = color
        brightness = (r * 299 + g * 587 + b * 114) / 1000
        return brightness > 127

    for img_idx, (orig_img_path, output_img_path, predictions) in enumerate(
        zip(orig_img_paths, output_img_paths, all_predictions)
    ):
        predictions = np.array(predictions)
        orig_img = cv2.imread(orig_img_path)
        height, width, _ = orig_img.shape
        box_thickness = int((height + width) / 400)
        font_scale = (height + width) / 2500

        grouped_preds = defaultdict(list)
        object_count = defaultdict(int)

        for pred_np in predictions:
            grouped_preds[int(pred_np[-1])].append(pred_np)

        def draw_box_and_label(pred, color):
            x1, y1, x2, y2, conf, _ = pred
            x1, y1, x2, y2 = map(int, (x1, y1, x2, y2))
            cv2.rectangle(orig_img, (x1, y1), (x2, y2), color, box_thickness)
            label = f"{class_labels[class_id]} {conf:.2f}"
            text_size, _ = cv2.getTextSize(label, font, font_scale, 1)
            label_y, bg_y = (
                (y1 - 4, y1 - text_size[1] - 4)
                if y1 - text_size[1] - 4 > 0
                else (y1 + text_size[1], y1)
            )
            cv2.rectangle(
                orig_img,
                (x1, bg_y),
                (x1 + text_size[0], bg_y + text_size[1]),
                color,
                -1,
            )
            font_color = (0, 0, 0) if is_bright_color(color) else (255, 255, 255)
            cv2.putText(
                orig_img,
                label,
                (x1, label_y),
                font,
                font_scale,
                font_color,
                1,
                cv2.LINE_AA,
            )

        for class_id, pred_list in grouped_preds.items():
            pred_list = np.array(pred_list)
            while len(pred_list) > 0:
                max_conf_idx = np.argmax(pred_list[:, 4])
                max_conf_pred = pred_list[max_conf_idx]
                pred_list = np.delete(pred_list, max_conf_idx, axis=0)
                color = color_dict[class_labels[class_id]]
                draw_box_and_label(max_conf_pred, color)
                object_count[class_labels[class_id]] += 1
                iou_scores = box_iou(np.array([max_conf_pred[:4]]), pred_list[:, :4])
                low_iou_indices = np.where(iou_scores[0] < iou_threshold)[0]
                pred_list = pred_list[low_iou_indices]
                for low_conf_pred in pred_list:
                    draw_box_and_label(low_conf_pred, color)

        print(f"Image {img_idx + 1}:")
        print("Objects detected:")
        for obj, count in object_count.items():
            print(f"- {obj}: {count}")

        cv2.imwrite(output_img_path, orig_img)
        print(f"saved detections at {output_img_path}")

def clip_boxes(boxes, shape):
    """Clip bounding boxes to stay within image boundaries.
    This function clips bounding boxes to ensure that they stay within the
    boundaries of the image.

    Arguments
    ---------
    boxes : torch.Tensor
        A tensor containing bounding boxes in the format [x1, y1, x2, y2].
    shape : tuple
        A tuple representing the shape of the image in the format
        (height, width).

    Returns
    -------
    A tensor containing the clipped bounding boxes : torch.Tensor
    """
    boxes[..., [0, 2]] = torch.clip(boxes[..., [0, 2]], 0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = torch.clip(boxes[..., [1, 3]], 0, shape[0])  # y1, y2
    return boxes

def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Scale bounding boxes to match a different image shape.
    This function scales bounding boxes to match a different image shape
    while maintaining their aspect ratio.

    Arguments
    ---------
    img1_shape : tuple
        A tuple representing the shape of the target image in the format
        (height, width).
    boxes : torch.Tensor
        A tensor containing bounding boxes in the format [x1, y1, x2, y2].
    img0_shape : tuple
        A tuple representing the shape of the source image in the format
        (height, width).
    ratio_pad : float or None, optional
        A scaling factor for the bounding boxes. If None, it is calculated
        based on the aspect ratio of the images. Default is None.

    Returns
    -------
    A tensor containing the scaled bounding boxes : torch.Tensor
    """
    gain = (
        ratio_pad
        if ratio_pad
        else min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
    )
    pad = (
        (img1_shape[1] - img0_shape[1] * gain) / 2,
        (img1_shape[0] - img0_shape[0] * gain) / 2,
    )

    boxes[..., [0, 2]] -= pad[0]
    boxes[..., [1, 3]] -= pad[1]
    boxes /= gain
    boxes = clip_boxes(boxes, img0_shape)
    return boxes

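# Illustrative usage (a sketch): a box predicted on a 640x640 letterboxed image
# is mapped back to a 320x640 original (gain 1.0, 160 px of vertical padding
# removed from each coordinate).
#   >>> b = torch.tensor([[100.0, 260.0, 200.0, 360.0]])
#   >>> scale_boxes((640, 640), b, (320, 640))
#   tensor([[100., 100., 200., 200.]])
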
def xywh2xyxy(x):
    """Convert bounding box coordinates from (x, y, width, height) to
    (x1, y1, x2, y2) format.
    This function converts bounding box coordinates from the format
    (center_x, center_y, width, height) to the format (x1, y1, x2, y2),
    where (x1, y1) represents the top-left corner and (x2, y2) represents
    the bottom-right corner of the bounding box.

    Arguments
    ---------
    x : torch.Tensor
        A tensor containing bounding box coordinates in the format
        (center_x, center_y, width, height).

    Returns
    -------
    torch.Tensor
        A tensor containing bounding box coordinates in the format
        (x1, y1, x2, y2).
    """
    xy = x[..., :2]  # center x, y
    wh = x[..., 2:4]  # width, height
    xy1 = xy - wh / 2  # top left x, y
    xy2 = xy + wh / 2  # bottom right x, y
    result = torch.cat((xy1, xy2), dim=-1)
    return result

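# Illustrative usage (a sketch): a 100x100 box centred at (50, 50).
#   >>> xywh2xyxy(torch.tensor([[50.0, 50.0, 100.0, 100.0]]))
#   tensor([[  0.,   0., 100., 100.]])
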
def bbox_format(box):
    """
    Convert a tensor of coordinates [x1, y1, x2, y2] representing two points
    defining a rectangle to the format [x_min, y_min, x_max, y_max], where
    x_min, y_min represent the top-left corner, and x_max, y_max represent
    the bottom-right corner of the rectangle.

    Arguments
    ---------
    box : torch.Tensor
        A tensor of coordinates in the format [x1, y1, x2, y2] where
        x1, y1, x2, y2 represent the coordinates of two points defining a
        rectangle.

    Returns
    -------
    torch.Tensor
        The coordinates in the format [x_min, y_min, x_max, y_max] where
        x_min, y_min represent the top-left vertex, and x_max, y_max
        represent the bottom-right vertex of the rectangle.
    """
    x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
    x_min = torch.min(x1, x2)
    x_max = torch.max(x1, x2)
    y_min = torch.min(y1, y2)
    y_max = torch.max(y1, y2)
    return torch.tensor([x_min, y_min, x_max, y_max])

def calculate_iou(box1, box2):
    """
    Calculate the Intersection over Union (IoU) between two bounding boxes.

    Arguments
    ---------
    box1 : torch.Tensor
        First bounding box in the format [x1, y1, x2, y2].
    box2 : torch.Tensor
        Second bounding box in the format [x1, y1, x2, y2].

    Returns
    -------
    float
        The intersection over union of the two bounding boxes.
    """
    x1 = torch.max(box1[0], box2[0])
    y1 = torch.max(box1[1], box2[1])
    x2 = torch.min(box1[2], box2[2])
    y2 = torch.min(box1[3], box2[3])

    intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)
    area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_box1 + area_box2 - intersection

    iou = intersection / union
    return iou.item()

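# Illustrative usage (a sketch, single-pair counterpart of box_iou above):
#   >>> a = torch.tensor([0.0, 0.0, 10.0, 10.0])
#   >>> b = torch.tensor([0.0, 5.0, 10.0, 15.0])
#   >>> round(calculate_iou(a, b), 3)
#   0.333
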
def average_precision(predictions, ground_truth, class_id, iou_threshold=0.5):
    """
    Calculate the average precision (AP) for a specific class in YOLO
    predictions.

    Arguments
    ---------
    predictions : torch.Tensor
        Tensor of prediction boxes in the format
        [x1, y1, x2, y2, confidence, class_id].
    ground_truth : torch.Tensor
        Tensor of ground truth boxes in the same format.
    class_id : int
        The class ID for which to calculate AP.
    iou_threshold : float
        The IoU threshold for considering a prediction as correct.

    Returns
    -------
    float
        The average precision for the specified class.
    """
    predictions = predictions[predictions[:, 5] == class_id]
    ground_truth = ground_truth[ground_truth[:, 5] == class_id]

    _, indices = torch.sort(predictions[:, 4], descending=True)
    predictions = predictions[indices]

    tp = torch.zeros(len(predictions))
    fp = torch.zeros(len(predictions))
    gt_count = len(ground_truth)

    for i, pred in enumerate(predictions):
        best_iou = 0
        for j, gt in enumerate(ground_truth):
            iou = calculate_iou(pred[:4], gt[:4])
            if iou > best_iou and iou >= iou_threshold:
                best_iou = iou
                best_gt_idx = j

        if best_iou > 0:
            tp[i] = 1
            tmp = torch.ones(ground_truth.shape[0])
            tmp[best_gt_idx] = 0
            ground_truth = ground_truth[tmp.bool()]
            # ground_truth.pop(best_gt_idx)
        else:
            fp[i] = 1

    precision = torch.cumsum(tp, dim=0) / (
        torch.cumsum(tp, dim=0) + torch.cumsum(fp, dim=0)
    )
    recall = torch.cumsum(tp, dim=0) / gt_count

    # Compute the average precision using the 11-point interpolation
    ap = torch.tensor(0.0)
    for t in torch.arange(0.0, 1.1, 0.1):
        recall_greater = recall >= t
        num_true = torch.sum(recall_greater).item()
        if num_true == 0:
            p = torch.tensor(0.0)
        else:
            p = torch.max(precision[recall_greater])
        ap += p / 11.0

    return ap.item()

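# Illustrative usage (a sketch): two class-0 predictions against one ground
# truth box; the confident prediction matches (true positive) while the far
# away one is a false positive, so recall reaches 1 while precision is 1 and
# the 11-point AP stays high.
#   >>> preds = torch.tensor([[10., 10., 50., 50., 0.9, 0.],
#   ...                       [60., 60., 90., 90., 0.6, 0.]])
#   >>> gts = torch.tensor([[11., 11., 49., 49., 1.0, 0.]])
#   >>> average_precision(preds, gts, class_id=0, iou_threshold=0.5) > 0.9
#   True
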
def mean_average_precision(post_predictions, batch, batch_bboxes, iou_threshold=0.5):
    """
    Calculate the mean average precision (mAP) for all classes in YOLO
    predictions.

    Arguments
    ---------
    post_predictions : list
        List of post-processed predictions for bounding boxes.
    batch : dict
        A dictionary containing batch information, including image files and
        batch indices.
    batch_bboxes : torch.Tensor
        Tensor containing batch bounding boxes.
    iou_threshold : float
        The IoU threshold for considering a prediction as correct.

    Returns
    -------
    float
        The mean average precision (mAP).
    """
    batch_size = len(batch["im_file"])
    mmAP = []
    for batch_el in range(batch_size):
        ap_sum = 0
        num_obj = torch.sum(batch["batch_idx"] == batch_el).item()
        bboxes = batch_bboxes[batch["batch_idx"] == batch_el]
        classes = batch["cls"][batch["batch_idx"] == batch_el]
        gt = torch.cat((bboxes, torch.ones((num_obj, 1)), classes), dim=1)
        for class_id in range(80):
            ap = average_precision(
                post_predictions[batch_el], gt, class_id, iou_threshold
            )
            ap_sum += ap
        div = torch.unique(gt[:, -1]).size(0)
        if div == 0:
            mAP = 0
        else:
            mAP = ap_sum / div
        mmAP.append(mAP)
    mmAP = sum(mmAP) / len(mmAP)
    return mmAP