YOLOv8 Source Code Analysis (42)
.\yolov8\ultralytics\utils\loss.py
# Import the required PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import specific utilities from the Ultralytics package
from ultralytics.utils.metrics import OKS_SIGMA
from ultralytics.utils.ops import crop_mask, xywh2xyxy, xyxy2xywh
from ultralytics.utils.tal import RotatedTaskAlignedAssigner, TaskAlignedAssigner, dist2bbox, dist2rbox, make_anchors
from ultralytics.utils.torch_utils import autocast
# Import the bbox_iou and probiou functions from the local .metrics module
from .metrics import bbox_iou, probiou
# Import the bbox2dist function from the local .tal module
from .tal import bbox2dist
# Define a PyTorch module named VarifocalLoss, subclassing nn.Module
class VarifocalLoss(nn.Module):
"""
Varifocal loss by Zhang et al.
https://arxiv.org/abs/2008.13367.
"""
def __init__(self):
"""Initialize the VarifocalLoss class."""
super().__init__()
@staticmethod
def forward(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
"""Computes varfocal loss."""
# Compute the varifocal weighting term
weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
# Disable automatic mixed precision for the loss computation
with autocast(enabled=False):
# Weighted BCE-with-logits, averaged over dim 1 and then summed
loss = (
(F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none") * weight)
.mean(1)  # mean over dimension 1
.sum()  # sum the result
)
return loss
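# --- Illustrative usage sketch (not part of loss.py); shapes and values are made up. ---
# VarifocalLoss expects raw class logits, soft IoU-aware targets and a binary positive mask.
import torch
vfl = VarifocalLoss()
pred_score = torch.randn(2, 8400, 80)  # (batch, anchors, classes) logits
gt_score = torch.rand(2, 8400, 80)  # soft quality targets
label = (gt_score > 0.5).float()  # binary positive mask
vfl_loss = vfl(pred_score, gt_score, label)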
class FocalLoss(nn.Module):
"""Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)."""
def __init__(self):
"""Initializer for FocalLoss class with no parameters."""
super().__init__()
@staticmethod
def forward(pred, label, gamma=1.5, alpha=0.25):
"""Calculates and updates confusion matrix for object detection/classification tasks."""
# Binary cross-entropy with logits, element-wise
loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
# Convert logits to probabilities
pred_prob = pred.sigmoid()  # probabilities from logits
# p_t: probability assigned to the true class
p_t = label * pred_prob + (1 - label) * (1 - pred_prob)
# Modulating factor that down-weights easy examples
modulating_factor = (1.0 - p_t) ** gamma
# Apply the modulating factor to the loss
loss *= modulating_factor
# Apply the alpha balancing factor if alpha > 0
if alpha > 0:
alpha_factor = label * alpha + (1 - label) * (1 - alpha)
loss *= alpha_factor
return loss.mean(1).sum()
class DFLoss(nn.Module):
"""Criterion class for computing DFL losses during training."""
def __init__(self, reg_max=16) -> None:
"""Initialize the DFL module."""
super().__init__()
self.reg_max = reg_max
def __call__(self, pred_dist, target):
"""
Return sum of left and right DFL losses.
Distribution Focal Loss (DFL) proposed in Generalized Focal Loss
https://ieeexplore.ieee.org/document/9792391
"""
# Clamp the targets into [0, reg_max - 1 - 0.01]
target = target.clamp_(0, self.reg_max - 1 - 0.01)
# Integer (left) bin of each target
tl = target.long()  # target left
# Right neighbouring bin
tr = tl + 1  # target right
# Weight assigned to the left bin
wl = tr - target  # weight left
# Weight assigned to the right bin
wr = 1 - wl  # weight right
# Cross-entropy against the left bin, weighted
left_loss = F.cross_entropy(pred_dist, tl.view(-1), reduction="none").view(tl.shape) * wl
# Cross-entropy against the right bin, weighted
right_loss = F.cross_entropy(pred_dist, tr.view(-1), reduction="none").view(tl.shape) * wr
# Mean of the left and right losses over the last dimension, keeping the dimension
return (left_loss + right_loss).mean(-1, keepdim=True)
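# --- Illustrative worked example (not part of loss.py): how DFL splits a continuous target. ---
# A continuous distance of 2.7 falls between bins 2 and 3, so the left bin gets weight 0.3 and
# the right bin weight 0.7; the weighted cross-entropies reconstruct the fractional target.
import torch
import torch.nn.functional as F
target = torch.tensor([2.7])
tl, tr = target.long(), target.long() + 1  # bins 2 and 3
wl, wr = tr - target, 1 - (tr - target)  # weights 0.3 and 0.7
pred_dist = torch.randn(1, 16)  # logits over reg_max=16 bins for one box side
dfl_example = F.cross_entropy(pred_dist, tl, reduction="none") * wl + F.cross_entropy(pred_dist, tr, reduction="none") * wr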
# Module that computes the bounding-box regression loss
class BboxLoss(nn.Module):
"""Criterion class for computing training losses during training."""
def __init__(self, reg_max=16):
"""Initialize the BboxLoss module with regularization maximum and DFL settings."""
super().__init__()
# Create a DFLoss instance if reg_max > 1, otherwise disable DFL
self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None
def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
"""Compute IoU loss."""
# Sum of target scores for foreground anchors, used as a per-anchor weight
weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
# IoU between predicted and target boxes (CIoU variant)
iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
# IoU loss, weighted and normalized by the total target score
loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
# DFL loss
if self.dfl_loss:
# Convert the target boxes into per-anchor ltrb distances
target_ltrb = bbox2dist(anchor_points, target_bboxes, self.dfl_loss.reg_max - 1)
# Distribution Focal Loss
loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
loss_dfl = loss_dfl.sum() / target_scores_sum
else:
# No DFL: use a zero loss
loss_dfl = torch.tensor(0.0).to(pred_dist.device)
return loss_iou, loss_dfl
# Subclass of BboxLoss that handles rotated (oriented) bounding boxes
class RotatedBboxLoss(BboxLoss):
"""Criterion class for computing training losses during training."""
def __init__(self, reg_max):
"""Initialize the RotatedBboxLoss module with regularization maximum and DFL settings."""
super().__init__(reg_max)
def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
"""Compute IoU loss for rotated bounding boxes."""
# Sum of target scores for foreground anchors, used as a per-anchor weight
weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
# Probabilistic IoU between predicted and target rotated boxes
iou = probiou(pred_bboxes[fg_mask], target_bboxes[fg_mask])
# IoU loss, weighted and normalized by the total target score
loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
# DFL loss
if self.dfl_loss:
# Convert the target boxes (xywh part) into per-anchor ltrb distances
target_ltrb = bbox2dist(anchor_points, xywh2xyxy(target_bboxes[..., :4]), self.dfl_loss.reg_max - 1)
# Distribution Focal Loss
loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
loss_dfl = loss_dfl.sum() / target_scores_sum
else:
# No DFL: use a zero loss
loss_dfl = torch.tensor(0.0).to(pred_dist.device)
return loss_iou, loss_dfl
# Module that computes the keypoint (pose) loss
class KeypointLoss(nn.Module):
"""Criterion class for computing training losses."""
def __init__(self, sigmas) -> None:
"""Initialize the KeypointLoss class."""
super().__init__()
# Store the per-keypoint sigmas used to scale each keypoint's error
self.sigmas = sigmas
def forward(self, pred_kpts, gt_kpts, kpt_mask, area):
"""Calculate the loss factor and Euclidean distance loss between predicted and ground-truth keypoints."""
# Squared Euclidean distance between predicted and ground-truth keypoints in x and y
d = (pred_kpts[..., 0] - gt_kpts[..., 0]).pow(2) + (pred_kpts[..., 1] - gt_kpts[..., 1]).pow(2)
# Loss factor that rescales by the fraction of labelled keypoints, so sparsely labelled instances are not under-weighted
kpt_loss_factor = kpt_mask.shape[1] / (torch.sum(kpt_mask != 0, dim=1) + 1e-9)
# OKS-style exponent, scaled by the per-keypoint sigmas and the instance area
e = d / ((2 * self.sigmas).pow(2) * (area + 1e-9) * 2)  # formula from cocoeval
# Weight by kpt_mask so only labelled keypoints contribute, then average
return (kpt_loss_factor.view(-1, 1) * ((1 - torch.exp(-e)) * kpt_mask)).mean()
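# --- Illustrative usage sketch (not part of loss.py); shapes are made up. ---
# KeypointLoss takes xy keypoints, a mask of labelled keypoints and the per-instance box area.
import torch
sigmas = torch.ones(17) / 17  # uniform sigmas, as used when the keypoint layout is not the COCO 17x3 one
kpt_criterion = KeypointLoss(sigmas=sigmas)
pred_kpts = torch.rand(8, 17, 2)  # (instances, keypoints, xy)
gt_kpts = torch.rand(8, 17, 2)
kpt_mask = torch.ones(8, 17)  # 1 where the keypoint is labelled
area = torch.rand(8, 1) + 0.1  # per-instance box area
kpt_loss = kpt_criterion(pred_kpts, gt_kpts, kpt_mask, area)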
class v8DetectionLoss:
"""Criterion class for computing training losses."""
def __init__(self, model, tal_topk=10): # model must be de-paralleled
"""Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function."""
# Get the device the model lives on
device = next(model.parameters()).device  # get model device
# Hyperparameters attached to the model
h = model.args  # hyperparameters
# Last model component, normally the Detect() head
m = model.model[-1]  # Detect() module
# BCE-with-logits loss without reduction
self.bce = nn.BCEWithLogitsLoss(reduction="none")
# Store the hyperparameters
self.hyp = h
# Model strides
self.stride = m.stride  # model strides
# Number of classes
self.nc = m.nc  # number of classes
# Number of outputs per anchor: class scores plus box regression bins
self.no = m.nc + m.reg_max * 4
# Maximum number of regression bins per box side
self.reg_max = m.reg_max
# Store the device
self.device = device
# Whether DFL (Distribution Focal Loss) is used
self.use_dfl = m.reg_max > 1
# Task-aligned assigner that matches ground-truth boxes to anchors
self.assigner = TaskAlignedAssigner(topk=tal_topk, num_classes=self.nc, alpha=0.5, beta=6.0)
# Bounding-box loss with the given number of regression bins
self.bbox_loss = BboxLoss(m.reg_max).to(device)
# Projection vector used to turn the DFL distribution into an expected distance
self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)
def preprocess(self, targets, batch_size, scale_tensor):
"""Preprocesses the target counts and matches with the input batch size to output a tensor."""
# Shape of the target tensor: number of labels and elements per label
nl, ne = targets.shape
# If there are no targets, return an empty zero tensor
if nl == 0:
out = torch.zeros(batch_size, 0, ne - 1, device=self.device)
else:
# Image index of each label and the per-image label counts
i = targets[:, 0]  # image index
_, counts = i.unique(return_counts=True)
counts = counts.to(dtype=torch.int32)
# Zero tensor holding the preprocessed targets, padded to the largest per-image count
out = torch.zeros(batch_size, counts.max(), ne - 1, device=self.device)
for j in range(batch_size):
# Select the labels belonging to image j
matches = i == j
n = matches.sum()
if n:
out[j, :n] = targets[matches, 1:]
# Scale the box coordinates and convert from xywh to xyxy
out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
return out
def bbox_decode(self, anchor_points, pred_dist):
"""Decode predicted object bounding box coordinates from anchor points and distribution."""
# If DFL is used, turn the predicted distribution into expected distances
if self.use_dfl:
b, a, c = pred_dist.shape  # batch, anchors, channels
# Softmax over the bins, then project onto the bin indices (expected value)
pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
# Two equivalent alternatives, kept for reference:
# pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
# pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
# Decode the distances relative to the anchor points into xyxy boxes
return dist2bbox(pred_dist, anchor_points, xywh=False)
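# --- Illustrative sketch (not part of loss.py): the matmul with self.proj is the expectation of
# the per-side distance distribution; for a single anchor with reg_max=16 bins: ---
import torch
reg_max = 16
proj = torch.arange(reg_max, dtype=torch.float)
side_logits = torch.randn(4, reg_max)  # one anchor: 4 box sides, 16 bins each
side_dist = side_logits.softmax(-1).matmul(proj)  # expected left/top/right/bottom distances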
def __call__(self, preds, batch):
"""Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
# Zero tensor that accumulates the three loss terms: box, cls, dfl
loss = torch.zeros(3, device=self.device)  # box, cls, dfl
# If preds is a tuple, the feature maps are its second element
feats = preds[1] if isinstance(preds, tuple) else preds
# Concatenate the feature maps and split them into the predicted distribution and class scores
pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
(self.reg_max * 4, self.nc), 1
)
# Permute so the anchor dimension comes second, as expected by the code below
pred_scores = pred_scores.permute(0, 2, 1).contiguous()
pred_distri = pred_distri.permute(0, 2, 1).contiguous()
# Data type and batch size of the predictions
dtype = pred_scores.dtype
batch_size = pred_scores.shape[0]
# Image size in pixels, derived from the first feature map and its stride
imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
# Generate anchor points and the matching stride tensor with make_anchors
anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
# Assemble the targets (image index, class, box), move them to the device and preprocess
targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
# Split the targets into class labels and boxes
gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
# Mask of valid ground-truth boxes (non-zero coordinates)
mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)
# Decode the predicted boxes into grid coordinates
pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
# Run the assigner to obtain matched target boxes, scores and the foreground mask
_, target_bboxes, target_scores, fg_mask, _ = self.assigner(
pred_scores.detach().sigmoid(),
(pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
anchor_points * stride_tensor,
gt_labels,
gt_bboxes,
mask_gt,
)
# Total target score, used to normalize the losses
target_scores_sum = max(target_scores.sum(), 1)
# Classification loss: binary cross-entropy over all anchors
loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE
# If there are foreground anchors, compute the box and distribution losses
if fg_mask.sum():
target_bboxes /= stride_tensor
loss[0], loss[2] = self.bbox_loss(
pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
)
# Scale each loss term by its gain from the hyperparameters
loss[0] *= self.hyp.box  # box gain
loss[1] *= self.hyp.cls  # cls gain
loss[2] *= self.hyp.dfl  # dfl gain
# Return the total loss scaled by batch size, plus the detached per-term losses
return loss.sum() * batch_size, loss.detach() # loss(box, cls, dfl)
class v8SegmentationLoss(v8DetectionLoss):
"""Criterion class for computing training losses."""
def __init__(self, model): # model must be de-paralleled
"""Initializes the v8SegmentationLoss class, taking a de-paralleled model as argument."""
super().__init__(model)
self.overlap = model.args.overlap_mask
@staticmethod
def single_mask_loss(
gt_mask: torch.Tensor, pred: torch.Tensor, proto: torch.Tensor, xyxy: torch.Tensor, area: torch.Tensor
) -> torch.Tensor:
"""
Compute the instance segmentation loss for a single image.
Args:
gt_mask (torch.Tensor): Ground truth mask of shape (n, H, W), where n is the number of objects.
pred (torch.Tensor): Predicted mask coefficients of shape (n, 32).
proto (torch.Tensor): Prototype masks of shape (32, H, W).
xyxy (torch.Tensor): Ground truth bounding boxes in xyxy format, normalized to [0, 1], of shape (n, 4).
area (torch.Tensor): Area of each ground truth bounding box of shape (n,).
Returns:
(torch.Tensor): The calculated mask loss for a single image.
Notes:
The function uses the equation pred_mask = torch.einsum('in,nhw->ihw', pred, proto) to produce the
predicted masks from the prototype masks and predicted mask coefficients.
"""
# Compute predicted masks using prototype masks and coefficients
pred_mask = torch.einsum("in,nhw->ihw", pred, proto) # (n, 32) @ (32, H, W) -> (n, H, W)
# Compute binary cross entropy loss between predicted masks and ground truth masks
loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
# Crop the loss using bounding boxes, then compute mean per instance and sum across instances
return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).sum()
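# --- Illustrative sketch (not part of loss.py): the einsum above builds each instance mask as a
# linear combination of the 32 prototype masks, weighted by that instance's coefficients. ---
import torch
proto_ex = torch.randn(32, 160, 160)  # prototype masks
coeff_ex = torch.randn(5, 32)  # 5 instances, 32 coefficients each
mask_logits_ex = torch.einsum("in,nhw->ihw", coeff_ex, proto_ex)  # (5, 160, 160) mask logits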
def calculate_segmentation_loss(
self,
fg_mask: torch.Tensor,
masks: torch.Tensor,
target_gt_idx: torch.Tensor,
target_bboxes: torch.Tensor,
batch_idx: torch.Tensor,
proto: torch.Tensor,
pred_masks: torch.Tensor,
imgsz: torch.Tensor,
overlap: bool,
) -> torch.Tensor:
"""
Calculate the loss for instance segmentation.
Args:
fg_mask (torch.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive.
masks (torch.Tensor): Ground truth masks of shape (BS, H, W) if `overlap` is False, otherwise (BS, ?, H, W).
target_gt_idx (torch.Tensor): Indexes of ground truth objects for each anchor of shape (BS, N_anchors).
target_bboxes (torch.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4).
batch_idx (torch.Tensor): Batch indices of shape (N_labels_in_batch, 1).
proto (torch.Tensor): Prototype masks of shape (BS, 32, H, W).
pred_masks (torch.Tensor): Predicted masks for each anchor of shape (BS, N_anchors, 32).
imgsz (torch.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W).
overlap (bool): Whether the masks in `masks` tensor overlap.
Returns:
(torch.Tensor): The calculated loss for instance segmentation.
Notes:
The batch loss can be computed for improved speed at higher memory usage.
For example, pred_mask can be computed as follows:
pred_mask = torch.einsum('in,nhw->ihw', pred, proto) # (i, 32) @ (32, 160, 160) -> (i, 160, 160)
"""
_, _, mask_h, mask_w = proto.shape  # prototype mask height and width
loss = 0  # running loss
# Normalize to 0-1
target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]]  # boxes normalized to the 0-1 range
# Areas of target bboxes
marea = xyxy2xywh(target_bboxes_normalized)[..., 2:].prod(2)  # per-box area
# Normalize to mask size
mxyxy = target_bboxes_normalized * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=proto.device)  # boxes in mask coordinates
# Iterate per image over: foreground mask, target indices, predicted masks, prototypes, boxes, areas, GT masks
for i, single_i in enumerate(zip(fg_mask, target_gt_idx, pred_masks, proto, mxyxy, marea, masks)):
fg_mask_i, target_gt_idx_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i
if fg_mask_i.any():  # if any anchor in this image is foreground
mask_idx = target_gt_idx_i[fg_mask_i]  # ground-truth indices of the foreground anchors
if overlap:
gt_mask = masks_i == (mask_idx + 1).view(-1, 1, 1)  # overlapping masks: recover per-instance masks by index
gt_mask = gt_mask.float()  # convert to float
else:
gt_mask = masks[batch_idx.view(-1) == i][mask_idx]  # otherwise index the ground-truth masks directly
# Accumulate the single-image mask loss
loss += self.single_mask_loss(
gt_mask, pred_masks_i[fg_mask_i], proto_i, mxyxy_i[fg_mask_i], marea_i[fg_mask_i]
)
# WARNING: lines below prevent Multi-GPU DDP 'unused gradient' PyTorch errors, do not remove
else:
# Keep proto and pred_masks in the graph to avoid unused-gradient errors in multi-GPU DDP
loss += (proto * 0).sum() + (pred_masks * 0).sum()  # inf sums may lead to nan loss
# Return the loss averaged over the number of foreground anchors
return loss / fg_mask.sum()
class v8PoseLoss(v8DetectionLoss):
"""Criterion class for computing training losses."""
def __init__(self, model): # model must be de-paralleled
"""Initializes v8PoseLoss with model, sets keypoint variables and declares a keypoint loss instance."""
super().__init__(model)
self.kpt_shape = model.model[-1].kpt_shape
self.bce_pose = nn.BCEWithLogitsLoss()
# Check if the model deals with pose keypoints (17 keypoints with 3 coordinates each)
is_pose = self.kpt_shape == [17, 3]
nkpt = self.kpt_shape[0] # number of keypoints
# Set sigmas for keypoint loss calculation based on whether it's pose or not
sigmas = torch.from_numpy(OKS_SIGMA).to(self.device) if is_pose else torch.ones(nkpt, device=self.device) / nkpt
self.keypoint_loss = KeypointLoss(sigmas=sigmas)
@staticmethod
def kpts_decode(anchor_points, pred_kpts):
"""Decodes predicted keypoints to image coordinates."""
y = pred_kpts.clone()
# Scale keypoints coordinates
y[..., :2] *= 2.0
# Translate keypoints to their anchor points
y[..., 0] += anchor_points[:, [0]] - 0.5
y[..., 1] += anchor_points[:, [1]] - 0.5
return y
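# --- Illustrative sketch (not part of loss.py): a raw keypoint prediction of 0 decodes to half a
# grid cell to the left of and above its anchor centre, since offsets are doubled and shifted by
# (anchor - 0.5). ---
import torch
anchor_points_ex = torch.tensor([[4.5, 7.5]])  # one anchor centre in grid units
pred_kpts_ex = torch.zeros(1, 1, 17, 3)  # raw predictions for 17 keypoints
decoded = pred_kpts_ex.clone()
decoded[..., :2] *= 2.0
decoded[..., 0] += anchor_points_ex[:, [0]] - 0.5  # x -> 4.0
decoded[..., 1] += anchor_points_ex[:, [1]] - 0.5  # y -> 7.0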
def calculate_keypoints_loss(
self, masks, target_gt_idx, keypoints, batch_idx, stride_tensor, target_bboxes, pred_kpts
):
"""Calculate the keypoints loss."""
# Implementation of keypoints loss calculation goes here
pass # Placeholder for actual implementation
class v8ClassificationLoss:
"""Criterion class for computing training losses."""
def __call__(self, preds, batch):
"""Compute the classification loss between predictions and true labels."""
loss = F.cross_entropy(preds, batch["cls"], reduction="mean")
loss_items = loss.detach()
return loss, loss_items
class v8OBBLoss(v8DetectionLoss):
"""Calculates losses for object detection, classification, and box distribution in rotated YOLO models."""
def __init__(self, model):
"""Initializes v8OBBLoss with model, assigner, and rotated bbox loss; note model must be de-paralleled."""
super().__init__(model)
self.assigner = RotatedTaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
self.bbox_loss = RotatedBboxLoss(self.reg_max).to(self.device)
def preprocess(self, targets, batch_size, scale_tensor):
"""Preprocesses the target counts and matches with the input batch size to output a tensor."""
if targets.shape[0] == 0:
# If no targets, return zero tensor
out = torch.zeros(batch_size, 0, 6, device=self.device)
else:
i = targets[:, 0] # image index
_, counts = i.unique(return_counts=True)
counts = counts.to(dtype=torch.int32)
# Initialize output tensor with proper dimensions
out = torch.zeros(batch_size, counts.max(), 6, device=self.device)
for j in range(batch_size):
matches = i == j
n = matches.sum()
if n:
# Extract and scale bounding boxes, then concatenate with class labels
bboxes = targets[matches, 2:]
bboxes[..., :4].mul_(scale_tensor)
out[j, :n] = torch.cat([targets[matches, 1:2], bboxes], dim=-1)
return out
def bbox_decode(self, anchor_points, pred_dist, pred_angle):
"""
Decode predicted object bounding box coordinates from anchor points and distribution.
Args:
anchor_points (torch.Tensor): Anchor points, (h*w, 2).
pred_dist (torch.Tensor): Predicted rotated distance, (bs, h*w, 4).
pred_angle (torch.Tensor): Predicted angle, (bs, h*w, 1).
Returns:
(torch.Tensor): Predicted rotated bounding boxes with angles, (bs, h*w, 5).
"""
# If DFL (Distribution Focal Loss) is used, decode the predicted distance distribution
if self.use_dfl:
# Batch size, number of anchors, channels
b, a, c = pred_dist.shape  # batch, anchors, channels
# Softmax over the bins, then project onto the bin indices (expected distance)
pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
# Concatenate the decoded rotated box coordinates with the predicted angle
return torch.cat((dist2rbox(pred_dist, pred_angle, anchor_points), pred_angle), dim=-1)
# E2EDetectLoss: criterion class for end-to-end detection training
class E2EDetectLoss:
"""Criterion class for computing training losses."""
def __init__(self, model):
"""Initialize E2EDetectLoss with one-to-many and one-to-one detection losses using the provided model."""
self.one2many = v8DetectionLoss(model, tal_topk=10)  # one-to-many detection loss
self.one2one = v8DetectionLoss(model, tal_topk=1)  # one-to-one detection loss
def __call__(self, preds, batch):
"""Calculate the sum of the box, cls and dfl losses, multiplied by batch size."""
preds = preds[1] if isinstance(preds, tuple) else preds  # if preds is a tuple, use its second element
one2many = preds["one2many"]  # one-to-many branch predictions
loss_one2many = self.one2many(one2many, batch)  # compute the one-to-many loss
one2one = preds["one2one"]  # one-to-one branch predictions
loss_one2one = self.one2one(one2one, batch)  # compute the one-to-one loss
return loss_one2many[0] + loss_one2one[0], loss_one2many[1] + loss_one2one[1]
# Returns the sum of both total losses and the sum of both detached loss breakdowns (box, cls, dfl)
.\yolov8\ultralytics\utils\metrics.py
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""Model validation metrics."""
import math
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
from ultralytics.utils import LOGGER, SimpleClass, TryExcept, plt_settings
# Object Keypoint Similarity (OKS) sigmas for different keypoints
OKS_SIGMA = (
np.array([0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62, 1.07, 1.07, 0.87, 0.87, 0.89, 0.89])
/ 10.0
)
def bbox_ioa(box1, box2, iou=False, eps=1e-7):
"""
Calculate the intersection over box2 area given box1 and box2. Boxes are in x1y1x2y2 format.
Args:
box1 (np.ndarray): A numpy array of shape (n, 4) representing n bounding boxes.
box2 (np.ndarray): A numpy array of shape (m, 4) representing m bounding boxes.
iou (bool): Calculate the standard IoU if True else return inter_area/box2_area.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(np.ndarray): A numpy array of shape (n, m) representing the intersection over box2 area.
"""
# Get the coordinates of bounding boxes
b1_x1, b1_y1, b1_x2, b1_y2 = box1.T
b2_x1, b2_y1, b2_x2, b2_y2 = box2.T
# Intersection area calculation
inter_area = (np.minimum(b1_x2[:, None], b2_x2) - np.maximum(b1_x1[:, None], b2_x1)).clip(0) * (
np.minimum(b1_y2[:, None], b2_y2) - np.maximum(b1_y1[:, None], b2_y1)
).clip(0)
# Box2 area calculation
area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
if iou:
box1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
area = area + box1_area[:, None] - inter_area
# Intersection over box2 area
return inter_area / (area + eps)
def box_iou(box1, box2, eps=1e-7):
"""
Calculate intersection-over-union (IoU) of boxes. Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
Args:
box1 (torch.Tensor): A tensor of shape (N, 4) representing N bounding boxes.
box2 (torch.Tensor): A tensor of shape (M, 4) representing M bounding boxes.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(torch.Tensor): An NxM tensor containing the pairwise IoU values for every element in box1 and box2.
"""
# Convert box coordinates to float for accurate computation
(a1, a2), (b1, b2) = box1.float().unsqueeze(1).chunk(2, 2), box2.float().unsqueeze(0).chunk(2, 2)
# Calculate intersection area
inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
# Compute IoU
return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
"""
Calculate Intersection over Union (IoU) of box1(1, 4) to box2(n, 4).
This function calculates IoU considering different variants such as Generalized IoU (GIoU),
Distance IoU (DIoU), and Complete IoU (CIoU) if specified.
Args:
box1 (torch.Tensor): A tensor representing a single bounding box of shape (1, 4).
box2 (torch.Tensor): A tensor representing multiple bounding boxes of shape (n, 4).
xywh (bool, optional): If True, treats boxes as (x_center, y_center, width, height).
GIoU (bool, optional): If True, compute Generalized IoU. Defaults to False.
DIoU (bool, optional): If True, compute Distance IoU. Defaults to False.
CIoU (bool, optional): If True, compute Complete IoU. Defaults to False.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(torch.Tensor): IoU values between box1 and each box in box2, of shape (n,).
"""
Args:
box1 (torch.Tensor): A tensor representing a single bounding box with shape (1, 4).
box2 (torch.Tensor): A tensor representing n bounding boxes with shape (n, 4).
xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in
(x1, y1, x2, y2) format. Defaults to True.
GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False.
DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False.
CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags.
"""
# Get the box coordinates according to the input format flag
if xywh:  # input format is (x, y, w, h)
# Split box1 and box2 into centre coordinates and sizes
(x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
# Half widths and half heights
w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
# Corner coordinates of both boxes
b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
else:  # input format is (x1, y1, x2, y2)
# Split box1 and box2 into corner coordinates
b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
# Widths and heights, with a small eps to avoid division by zero
w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
# Intersection area
inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * (
b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)
).clamp_(0)
# Union area
union = w1 * h1 + w2 * h2 - inter + eps
# IoU
iou = inter / union
if CIoU or DIoU or GIoU:
# Width and height of the smallest enclosing box
cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)
ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)
if CIoU or DIoU:  # Distance IoU or Complete IoU
# Squared diagonal of the enclosing box
c2 = cw.pow(2) + ch.pow(2) + eps
# Squared distance between the box centres
rho2 = (
(b2_x1 + b2_x2 - b1_x1 - b1_x2).pow(2) + (b2_y1 + b2_y2 - b1_y1 - b1_y2).pow(2)
) / 4
if CIoU:  # Complete IoU
v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
with torch.no_grad():
alpha = v / (v - iou + (1 + eps))
return iou - (rho2 / c2 + v * alpha)  # CIoU
return iou - rho2 / c2  # DIoU
# Area of the smallest enclosing box
c_area = cw * ch + eps
return iou - (c_area - union) / c_area  # GIoU
return iou  # plain IoU
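# --- Illustrative worked example (not part of metrics.py); values are made up. ---
# Two 10x10 boxes offset by (5, 5): IoU = 25/175 ~ 0.143, and CIoU subtracts the centre-distance
# penalty 50/450 ~ 0.111 (the aspect-ratio term is 0 here), giving roughly 0.03.
import torch
b1 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
b2 = torch.tensor([[5.0, 5.0, 15.0, 15.0]])
ciou_example = bbox_iou(b1, b2, xywh=False, CIoU=True)  # ~ 0.03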
def _get_covariance_matrix(boxes):
"""
Generate covariance matrix components from oriented bounding boxes.
Args:
boxes (torch.Tensor): A tensor of shape (N, 5) representing rotated bounding boxes, with xywhr format.
Returns:
(torch.Tensor): Covariance matrix components (a, b, c) corresponding to the original rotated bounding boxes.
"""
# Gaussian bounding boxes; the centre points (first two columns) are ignored because they are not needed here
gbbs = torch.cat((boxes[:, 2:4].pow(2) / 12, boxes[:, 4:]), dim=-1)
a, b, c = gbbs.split(1, dim=-1)
cos = c.cos()
sin = c.sin()
cos2 = cos.pow(2)
sin2 = sin.pow(2)
# Covariance matrix components of the rotated boxes
return a * cos2 + b * sin2, a * sin2 + b * cos2, (a - b) * cos * sin
# Probabilistic IoU between oriented bounding boxes, see https://arxiv.org/pdf/2106.06072v1.pdf
def probiou(obb1, obb2, CIoU=False, eps=1e-7):
"""
Calculate the prob IoU between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
Args:
obb1 (torch.Tensor): A tensor of shape (N, 5) representing ground truth obbs, with xywhr format.
obb2 (torch.Tensor): A tensor of shape (N, 5) representing predicted obbs, with xywhr format.
CIoU (bool, optional): If True, compute Complete IoU. Defaults to False.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(torch.Tensor): A tensor of shape (N, ) representing obb similarities.
"""
# Splitting the x and y coordinates from obb1 and obb2
x1, y1 = obb1[..., :2].split(1, dim=-1)
x2, y2 = obb2[..., :2].split(1, dim=-1)
# Calculating covariance matrix components for obb1 and obb2
a1, b1, c1 = _get_covariance_matrix(obb1)
a2, b2, c2 = _get_covariance_matrix(obb2)
# Calculation of terms t1, t2, and t3 for IoU computation
t1 = (
((a1 + a2) * (y1 - y2).pow(2) + (b1 + b2) * (x1 - x2).pow(2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)
) * 0.25
t2 = (((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)) * 0.5
t3 = (
((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2))
/ (4 * ((a1 * b1 - c1.pow(2)).clamp_(0) * (a2 * b2 - c2.pow(2)).clamp_(0)).sqrt() + eps)
+ eps
).log() * 0.5
# Combined term for boundary distance
bd = (t1 + t2 + t3).clamp(eps, 100.0)
# Hausdorff distance calculation
hd = (1.0 - (-bd).exp() + eps).sqrt()
# Intersection over Union (IoU) computation
iou = 1 - hd
# Compute Complete IoU (CIoU) if CIoU flag is True
if CIoU:
# Splitting width and height components from obb1 and obb2
w1, h1 = obb1[..., 2:4].split(1, dim=-1)
w2, h2 = obb2[..., 2:4].split(1, dim=-1)
# Calculating v value based on width and height ratios
v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
# Compute alpha factor and adjust IoU for CIoU
with torch.no_grad():
alpha = v / (v - iou + (1 + eps))
return iou - v * alpha # CIoU
# Return regular IoU if CIoU flag is False
return iou
# Probabilistic IoU between batches of oriented bounding boxes, see https://arxiv.org/pdf/2106.06072v1.pdf
def batch_probiou(obb1, obb2, eps=1e-7):
"""
Calculate the prob IoU between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
Args:
obb1 (torch.Tensor | np.ndarray): A tensor of shape (N, 5) representing ground truth obbs, with xywhr format.
obb2 (torch.Tensor | np.ndarray): A tensor of shape (M, 5) representing predicted obbs, with xywhr format.
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
Returns:
(torch.Tensor): A tensor of shape (N, M) representing obb similarities.
"""
# Convert obb1 and obb2 to torch.Tensor if they are np.ndarray
obb1 = torch.from_numpy(obb1) if isinstance(obb1, np.ndarray) else obb1
obb2 = torch.from_numpy(obb2) if isinstance(obb2, np.ndarray) else obb2
# Split out the xy centre coordinates for pairwise broadcasting
x1, y1 = obb1[..., :2].split(1, dim=-1)
x2, y2 = (x.squeeze(-1)[None] for x in obb2[..., :2].split(1, dim=-1))
# Covariance matrix components of both sets of boxes
a1, b1, c1 = _get_covariance_matrix(obb1)
a2, b2, c2 = (x.squeeze(-1)[None] for x in _get_covariance_matrix(obb2))
# The three terms of the Bhattacharyya-style distance
t1 = (
((a1 + a2) * (y1 - y2).pow(2) + (b1 + b2) * (x1 - x2).pow(2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)
) * 0.25
t2 = (((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)) * 0.5
t3 = (
((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2))
/ (4 * ((a1 * b1 - c1.pow(2)).clamp_(0) * (a2 * b2 - c2.pow(2)).clamp_(0)).sqrt() + eps)
+ eps
).log() * 0.5
# Combine the three terms and clamp to a numerically safe range
bd = (t1 + t2 + t3).clamp(eps, 100.0)
hd = (1.0 - (-bd).exp() + eps).sqrt()
# Return 1 minus the Hellinger distance as the IoU-like similarity
return 1 - hd
# Compute smoothed positive and negative binary cross-entropy targets
def smooth_BCE(eps=0.1):
"""
Computes smoothed positive and negative Binary Cross-Entropy targets.
This function calculates positive and negative label smoothing BCE targets based on a given epsilon value.
For implementation details, refer to https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441.
Args:
eps (float, optional): The epsilon value for label smoothing. Defaults to 0.1.
Returns:
(tuple): A tuple containing the positive and negative label smoothing BCE targets.
"""
# Return the smoothed positive and negative BCE targets
return 1.0 - 0.5 * eps, 0.5 * eps
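# --- Illustrative example (not part of metrics.py): with the default eps=0.1 the positive and
# negative targets become 1 - 0.05 = 0.95 and 0.05. ---
cp, cn = smooth_BCE(eps=0.1)  # (0.95, 0.05)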
class ConfusionMatrix:
"""
A class for calculating and updating a confusion matrix for object detection and classification tasks.
Attributes:
task (str): The type of task, either 'detect' or 'classify'.
matrix (np.ndarray): The confusion matrix, with dimensions depending on the task.
nc (int): The number of classes.
conf (float): The confidence threshold for detections.
iou_thres (float): The Intersection over Union threshold.
"""
# Class for computing and updating a confusion matrix for detection and classification tasks
def __init__(self, task, nc, conf=0.5, iou_thres=0.5):
self.task = task
self.matrix = np.zeros((nc, nc), dtype=np.int64)
self.nc = nc
self.conf = conf
self.iou_thres = iou_thres
# Update the entries of the confusion matrix
def update_matrix(self, targets, preds):
"""
Update the confusion matrix with new target and prediction entries.
Args:
targets (np.ndarray): An array containing the ground truth labels.
preds (np.ndarray): An array containing the predicted labels.
"""
for t, p in zip(targets, preds):
self.matrix[t, p] += 1
# Reset the confusion matrix
def reset_matrix(self):
"""Reset the confusion matrix to all zeros."""
self.matrix.fill(0)
# Print the current state of the confusion matrix
def print_matrix(self):
"""Print the current state of the confusion matrix."""
print(self.matrix)
def __init__(self, nc, conf=0.25, iou_thres=0.45, task="detect"):
"""
Initialize attributes for the YOLO model.
Args:
nc (int): Number of classes.
conf (float): Confidence threshold, default is 0.25, adjusted to 0.25 if None or 0.001.
iou_thres (float): IoU (Intersection over Union) threshold.
task (str): Task type, either "detect" or other.
Initializes:
self.task (str): Task type.
self.matrix (np.ndarray): Confusion matrix initialized based on task type and number of classes.
self.nc (int): Number of classes.
self.conf (float): Confidence threshold.
self.iou_thres (float): IoU threshold.
"""
self.task = task
self.matrix = np.zeros((nc + 1, nc + 1)) if self.task == "detect" else np.zeros((nc, nc))
self.nc = nc # number of classes
self.conf = 0.25 if conf in {None, 0.001} else conf # apply 0.25 if default val conf is passed
self.iou_thres = iou_thres
def process_cls_preds(self, preds, targets):
"""
Update confusion matrix for classification task.
Args:
preds (Array[N, min(nc,5)]): Predicted class labels.
targets (Array[N, 1]): Ground truth class labels.
"""
preds, targets = torch.cat(preds)[:, 0], torch.cat(targets)
for p, t in zip(preds.cpu().numpy(), targets.cpu().numpy()):
self.matrix[p][t] += 1
def process_batch(self, detections, gt_bboxes, gt_cls):
"""
Update confusion matrix for object detection task.
Args:
detections (Array[N, 6] | Array[N, 7]): Detected bounding boxes and their associated information.
Each row should contain (x1, y1, x2, y2, conf, class)
or with an additional element `angle` when it's obb.
gt_bboxes (Array[M, 4]| Array[N, 5]): Ground truth bounding boxes with xyxy/xyxyr format.
gt_cls (Array[M]): The class labels.
"""
# If there are no ground-truth labels
if gt_cls.shape[0] == 0:
if detections is not None:
# Drop detections below the confidence threshold
detections = detections[detections[:, 4] > self.conf]
# Predicted classes of the remaining detections
detection_classes = detections[:, 5].int()
for dc in detection_classes:
self.matrix[dc, self.nc] += 1  # false positives
return
# If there are no detections
if detections is None:
# Ground-truth classes
gt_classes = gt_cls.int()
for gc in gt_classes:
self.matrix[self.nc, gc] += 1  # background FN
return
# Drop detections below the confidence threshold
detections = detections[detections[:, 4] > self.conf]
# Ground-truth classes
gt_classes = gt_cls.int()
# Predicted classes of the detections
detection_classes = detections[:, 5].int()
# Whether detections and ground truth carry angle information (OBB)
is_obb = detections.shape[1] == 7 and gt_bboxes.shape[1] == 5
# IoU between ground-truth boxes and detections
iou = (
batch_probiou(gt_bboxes, torch.cat([detections[:, :4], detections[:, -1:]], dim=-1))
if is_obb
else box_iou(gt_bboxes, detections[:, :4])
)
# Keep only the pairs whose IoU exceeds the threshold
x = torch.where(iou > self.iou_thres)
if x[0].shape[0]:
matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
if x[0].shape[0] > 1:
matches = matches[matches[:, 2].argsort()[::-1]]
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
matches = matches[matches[:, 2].argsort()[::-1]]
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
else:
matches = np.zeros((0, 3))
# Whether there is at least one match
n = matches.shape[0] > 0
m0, m1, _ = matches.transpose().astype(int)
# Update the confusion matrix for each ground-truth label
for i, gc in enumerate(gt_classes):
j = m0 == i
if n and sum(j) == 1:
self.matrix[detection_classes[m1[j]], gc] += 1  # correct
else:
self.matrix[self.nc, gc] += 1  # true background (missed detection)
# Count unmatched detections as predicted background
if n:
for i, dc in enumerate(detection_classes):
if not any(m1 == i):
self.matrix[dc, self.nc] += 1  # predicted background
def matrix(self):
"""Returns the confusion matrix."""
return self.matrix
def tp_fp(self):
"""Returns true positives and false positives."""
tp = self.matrix.diagonal()  # diagonal entries are the true positives
fp = self.matrix.sum(1) - tp  # row sums minus the diagonal give the false positives
# fn = self.matrix.sum(0) - tp  # false negatives (missed detections) -- intentionally unused
return (tp[:-1], fp[:-1]) if self.task == "detect" else (tp, fp)  # drop the background class for detection
@TryExcept("WARNING ⚠️ ConfusionMatrix plot failure")
@plt_settings()
def plot(self, normalize=True, save_dir="", names=(), on_plot=None):
"""
Plot the confusion matrix using seaborn and save it to a file.
Args:
normalize (bool): Whether to normalize the confusion matrix.
save_dir (str): Directory where the plot will be saved.
names (tuple): Names of classes, used as labels on the plot.
on_plot (func): An optional callback to pass plots path and data when they are rendered.
"""
import seaborn  # import seaborn locally for plotting the confusion matrix
array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1e-9) if normalize else 1)  # column-normalize if requested
array[array < 0.005] = np.nan  # values below 0.005 are not annotated
fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True)  # create the figure and axes
nc, nn = self.nc, len(names)  # number of classes and number of provided names
seaborn.set_theme(font_scale=1.0 if nc < 50 else 0.8)  # shrink the font when there are many classes
labels = (0 < nn < 99) and (nn == nc)  # only use names as tick labels if they match the class count
ticklabels = (list(names) + ["background"]) if labels else "auto"  # tick labels, with background appended
with warnings.catch_warnings():
warnings.simplefilter("ignore")  # suppress the RuntimeWarning for empty (all-NaN) matrices
seaborn.heatmap(
array,
ax=ax,
annot=nc < 30,  # annotate cell values only for fewer than 30 classes
annot_kws={"size": 8},  # annotation font size
cmap="Blues",  # blue colormap
fmt=".2f" if normalize else ".0f",  # two decimals when normalized, integers otherwise
square=True,  # square cells
vmin=0.0,  # minimum value 0
xticklabels=ticklabels,  # x-axis tick labels
yticklabels=ticklabels,  # y-axis tick labels
).set_facecolor((1, 1, 1))  # white background
title = "Confusion Matrix" + " Normalized" * normalize  # plot title, with a suffix when normalized
ax.set_xlabel("True")  # x-axis label
ax.set_ylabel("Predicted")  # y-axis label
ax.set_title(title)  # set the title
plot_fname = Path(save_dir) / f'{title.lower().replace(" ", "_")}.png'  # output file name
fig.savefig(plot_fname, dpi=250)  # save the plot as a PNG at 250 DPI
plt.close(fig)  # close the figure
if on_plot:
on_plot(plot_fname)  # invoke the callback with the plot path, if provided
def print(self):
"""Print the confusion matrix to the console."""
for i in range(self.nc + 1):  # iterate over each row of the matrix
LOGGER.info(" ".join(map(str, self.matrix[i])))  # log each row as a space-separated string
def compute_ap(recall, precision):
"""
Compute the average precision (AP) given the recall and precision curves.
Args:
recall (list): The recall curve.
precision (list): The precision curve.
Returns:
(float): Average precision.
(np.ndarray): Precision envelope curve.
(np.ndarray): Modified recall curve with sentinel values added at the beginning and end.
"""
# Append sentinel values to beginning and end
mrec = np.concatenate(([0.0], recall, [1.0]))
mpre = np.concatenate(([1.0], precision, [0.0]))
# Compute the precision envelope
mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
# Integrate area under curve
method = "interp"  # integration method: 'interp' (101-point interpolation) or 'continuous'
if method == "interp":
x = np.linspace(0, 1, 101)  # 101 evenly spaced recall points (COCO-style)
ap = np.trapz(np.interp(x, mrec, mpre), x)  # integrate the interpolated precision curve
else:  # 'continuous'
i = np.where(mrec[1:] != mrec[:-1])[0]  # indices where the recall value changes
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under the curve
return ap, mpre, mrec  # return AP plus the modified precision and recall curves
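# --- Illustrative worked example (not part of metrics.py); values are made up. ---
# With recall [0.5, 1.0] and precision [1.0, 0.5], the precision envelope stays at 1.0 up to
# recall 0.5 and falls linearly to 0.5, so the 101-point integration gives roughly 0.87.
import numpy as np
ap_example, mpre_example, mrec_example = compute_ap(recall=[0.5, 1.0], precision=[1.0, 0.5])
print(round(float(ap_example), 2))  # ~0.87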
def ap_per_class(
tp, conf, pred_cls, target_cls, plot=False, on_plot=None, save_dir=Path(), names={}, eps=1e-16, prefix=""
):
"""Compute the average precision per class, given true-positive flags, confidences, predicted and target classes."""
# Sort by objectness confidence in descending order
i = np.argsort(-conf)
# Reorder tp, conf and pred_cls accordingly
tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
# Unique target classes and their label counts
unique_classes, nt = np.unique(target_cls, return_counts=True)
nc = unique_classes.shape[0]  # number of classes, number of detections
# Build precision-recall curves and compute the AP for each class
x, prec_values = np.linspace(0, 1, 1000), []
# Arrays holding the AP, precision and recall curves per class
ap, p_curve, r_curve = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
for ci, c in enumerate(unique_classes):
# ci is the index of class c within unique_classes
i = pred_cls == c
# Number of ground-truth labels of class c
n_l = nt[ci]  # number of labels
# Number of predictions of class c
n_p = i.sum()  # number of predictions
# Skip classes with no predictions or no labels
if n_p == 0 or n_l == 0:
continue
# Cumulative false positives and true positives
fpc = (1 - tp[i]).cumsum(0)
tpc = tp[i].cumsum(0)
# Recall
recall = tpc / (n_l + eps)  # recall curve
# Interpolate on negative x because confidence decreases along the curve
r_curve[ci] = np.interp(-x, -conf[i], recall[:, 0], left=0)  # negative x, xp because xp decreases
# Precision
precision = tpc / (tpc + fpc)  # precision curve
# Interpolate the precision curve onto the same grid
p_curve[ci] = np.interp(-x, -conf[i], precision[:, 0], left=1)  # p at pr_score
# AP from the recall-precision curve, one value per IoU threshold
for j in range(tp.shape[1]):
ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
# When plotting, record the precision values at mAP@0.5
if plot and j == 0:
prec_values.append(np.interp(x, mrec, mpre))  # precision at mAP@0.5
prec_values = np.array(prec_values)  # (nc, 1000)
# F1 score (harmonic mean of precision and recall)
f1_curve = 2 * p_curve * r_curve / (p_curve + r_curve + eps)
# Keep only the names of classes that actually have data
names = [v for k, v in names.items() if k in unique_classes]  # list: only classes that have data
names = dict(enumerate(names))  # convert back to a dict
# Plot the PR, F1, precision and recall curves if requested
if plot:
plot_pr_curve(x, prec_values, ap, save_dir / f"{prefix}PR_curve.png", names, on_plot=on_plot)
plot_mc_curve(x, f1_curve, save_dir / f"{prefix}F1_curve.png", names, ylabel="F1", on_plot=on_plot)
plot_mc_curve(x, p_curve, save_dir / f"{prefix}P_curve.png", names, ylabel="Precision", on_plot=on_plot)
plot_mc_curve(x, r_curve, save_dir / f"{prefix}R_curve.png", names, ylabel="Recall", on_plot=on_plot)
# Index of the maximum smoothed F1 value
i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
# Precision, recall and F1 at the max-F1 confidence
p, r, f1 = p_curve[:, i], r_curve[:, i], f1_curve[:, i]  # max-F1 precision, recall, F1 values
# True positives
tp = (r * nt).round()  # true positives
# False positives
fp = (tp / (p + eps) - tp).round()  # false positives
# Return TP, FP, precision, recall, F1, AP, class indices, the three curves, the x grid and the PR precision values
return tp, fp, p, r, f1, ap, unique_classes.astype(int), p_curve, r_curve, f1_curve, x, prec_values
class Metric(SimpleClass):
"""
Class for computing evaluation metrics for YOLOv8 model.
Attributes:
p (list): Precision for each class. Shape: (nc,).
r (list): Recall for each class. Shape: (nc,).
f1 (list): F1 score for each class. Shape: (nc,).
all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
ap_class_index (list): Index of class for each AP score. Shape: (nc,).
nc (int): Number of classes.
Methods:
ap50(): AP at IoU threshold of 0.5 for all classes. Returns: List of AP scores. Shape: (nc,) or [].
ap(): AP at IoU thresholds from 0.5 to 0.95 for all classes. Returns: List of AP scores. Shape: (nc,) or [].
mp(): Mean precision of all classes. Returns: Float.
mr(): Mean recall of all classes. Returns: Float.
map50(): Mean AP at IoU threshold of 0.5 for all classes. Returns: Float.
map75(): Mean AP at IoU threshold of 0.75 for all classes. Returns: Float.
map(): Mean AP at IoU thresholds from 0.5 to 0.95 for all classes. Returns: Float.
mean_results(): Mean of results, returns mp, mr, map50, map.
class_result(i): Class-aware result, returns p[i], r[i], ap50[i], ap[i].
maps(): mAP of each class. Returns: Array of mAP scores, shape: (nc,).
fitness(): Model fitness as a weighted combination of metrics. Returns: Float.
update(results): Update metric attributes with new evaluation results.
"""
def __init__(self) -> None:
"""Initializes a Metric instance for computing evaluation metrics for the YOLOv8 model."""
self.p = [] # Precision for each class, initialized as an empty list
self.r = [] # Recall for each class, initialized as an empty list
self.f1 = [] # F1 score for each class, initialized as an empty list
self.all_ap = [] # AP scores for all classes and IoU thresholds, initialized as an empty list
self.ap_class_index = [] # Index of class for each AP score, initialized as an empty list
self.nc = 0 # Number of classes, initialized to 0
@property
def ap50(self):
"""
Returns the Average Precision (AP) at an IoU threshold of 0.5 for all classes.
Returns:
(np.ndarray, list): Array of shape (nc,) with AP50 values per class, or an empty list if not available.
"""
return self.all_ap[:, 0] if len(self.all_ap) else [] # Return AP50 values if all_ap is not empty, otherwise an empty list
@property
def ap(self):
"""
Returns the Average Precision (AP) at IoU thresholds from 0.5 to 0.95 for all classes.
Returns:
(np.ndarray, list): Array of shape (nc,) with mean AP values per class, or an empty list if not available.
"""
return self.all_ap.mean(1) if len(self.all_ap) else [] # Return mean AP values across IoU thresholds if all_ap is not empty, otherwise an empty list
@property
def mp(self):
"""
Returns the Mean Precision of all classes.
Returns:
(float): The mean precision of all classes.
"""
return self.p.mean() if len(self.p) else 0.0 # Return the mean precision of classes if p is not empty, otherwise 0.0
@property
def mr(self):
"""
Returns the Mean Recall of all classes.
Returns:
(float): The mean recall of all classes.
"""
return self.r.mean() if len(self.r) else 0.0 # Return the mean recall of classes if r is not empty, otherwise 0.0
@property
def map50(self):
"""
Returns the mean Average Precision (mAP) at an IoU threshold of 0.5.
Returns:
(float): The mAP at an IoU threshold of 0.5.
"""
return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0
@property
def map75(self):
"""
Returns the mean Average Precision (mAP) at an IoU threshold of 0.75.
Returns:
(float): The mAP at an IoU threshold of 0.75.
"""
return self.all_ap[:, 5].mean() if len(self.all_ap) else 0.0
@property
def map(self):
"""
Returns the mean Average Precision (mAP) over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
Returns:
(float): The mAP over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
"""
return self.all_ap.mean() if len(self.all_ap) else 0.0
def mean_results(self):
"""Mean of results, return mp, mr, map50, map."""
return [self.mp, self.mr, self.map50, self.map]
def class_result(self, i):
"""Class-aware result, return p[i], r[i], ap50[i], ap[i]."""
return self.p[i], self.r[i], self.ap50[i], self.ap[i]
@property
def maps(self):
"""MAP of each class."""
maps = np.zeros(self.nc) + self.map
for i, c in enumerate(self.ap_class_index):
maps[c] = self.ap[i]
return maps
def fitness(self):
"""Model fitness as a weighted combination of metrics."""
w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
return (np.array(self.mean_results()) * w).sum()
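# --- Illustrative worked example (not part of metrics.py): with these weights, fitness is
# effectively 0.1 * mAP50 + 0.9 * mAP50-95; precision and recall do not contribute directly.
# For instance, mAP50 = 0.70 and mAP50-95 = 0.50 give 0.1 * 0.70 + 0.9 * 0.50 = 0.52. ---
import numpy as np
fitness_example = (np.array([0.80, 0.70, 0.70, 0.50]) * np.array([0.0, 0.0, 0.1, 0.9])).sum()  # 0.52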
def update(self, results):
"""
Updates the evaluation metrics of the model with a new set of results.
Args:
results (tuple): A tuple containing the following evaluation metrics:
- p (list): Precision for each class. Shape: (nc,).
- r (list): Recall for each class. Shape: (nc,).
- f1 (list): F1 score for each class. Shape: (nc,).
- all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
- ap_class_index (list): Index of class for each AP score. Shape: (nc,).
Side Effects:
Updates the class attributes `self.p`, `self.r`, `self.f1`, `self.all_ap`, and `self.ap_class_index` based
on the values provided in the `results` tuple.
"""
(
self.p,
self.r,
self.f1,
self.all_ap,
self.ap_class_index,
self.p_curve,
self.r_curve,
self.f1_curve,
self.px,
self.prec_values,
) = results
@property
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
return []
@property
def pr_curves(self):
"""Returns precision and recall curves."""
return self.p_curve, self.r_curve
@property
def f1_curves(self):
"""Returns F1 score curves."""
return self.f1_curve
@property
def precision_values(self):
"""Returns precision values for the PR curve."""
return self.px, self.prec_values
@property
def pr_values(self):
"""Returns precision and recall values."""
return self.p, self.r
@property
def f1_values(self):
"""Returns F1 values."""
return self.f1
@property
def pr(self):
"""Returns precision and recall."""
return self.p, self.r
@property
def f1(self):
"""Returns F1 score."""
return self.f1
def print_results(self):
"""Prints results (p, r, ap50, ap)."""
print(self.p, self.r, self.ap50, self.ap)
def evaluation(self):
"""Model evaluation with metric AP."""
return self.ap
def result(self):
"""Return p, r, ap50, ap."""
return self.p, self.r, self.ap50, self.ap
@property
def recall(self):
"""Returns recall."""
return self.r
@property
def mean(self):
"""Returns the mean AP."""
return self.ap.mean()
@property
def mapss(self):
"""Returns mAP of each class."""
maps = np.zeros(self.nc) + self.map
for i, c in enumerate(self.ap_class_index):
maps[c] = self.ap[i]
return maps
@property
def model(self):
"""Returns the model."""
return self.model
@property
def curves_results(self):
"""Returns a list of curves for accessing specific metrics curves."""
# Return [x, y, x-label, y-label] specifications for each metric curve
return [
[self.px, self.prec_values, "Recall", "Precision"],  # precision-recall curve
[self.px, self.f1_curve, "Confidence", "F1"],  # F1-confidence curve
[self.px, self.p_curve, "Confidence", "Precision"],  # precision-confidence curve
[self.px, self.r_curve, "Confidence", "Recall"],  # recall-confidence curve
]
class DetMetrics(SimpleClass):
"""
This class is a utility class for computing detection metrics such as precision, recall, and mean average precision
(mAP) of an object detection model.
Args:
save_dir (Path): A path to the directory where the output plots will be saved. Defaults to current directory.
plot (bool): A flag that indicates whether to plot precision-recall curves for each class. Defaults to False.
on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
names (tuple of str): A tuple of strings that represents the names of the classes. Defaults to an empty tuple.
Attributes:
save_dir (Path): A path to the directory where the output plots will be saved.
plot (bool): A flag that indicates whether to plot the precision-recall curves for each class.
on_plot (func): An optional callback to pass plots path and data when they are rendered.
names (tuple of str): A tuple of strings that represents the names of the classes.
box (Metric): An instance of the Metric class for storing the results of the detection metrics.
speed (dict): A dictionary for storing the execution time of different parts of the detection process.
Methods:
process(tp, conf, pred_cls, target_cls): Updates the metric results with the latest batch of predictions.
keys: Returns a list of keys for accessing the computed detection metrics.
mean_results: Returns a list of mean values for the computed detection metrics.
class_result(i): Returns a list of values for the computed detection metrics for a specific class.
maps: Returns a dictionary of mean average precision (mAP) values for different IoU thresholds.
fitness: Computes the fitness score based on the computed detection metrics.
ap_class_index: Returns a list of class indices sorted by their average precision (AP) values.
results_dict: Returns a dictionary that maps detection metric keys to their computed values.
curves: TODO
curves_results: TODO
"""
def __init__(self, save_dir=Path("."), plot=False, on_plot=None, names=()) -> None:
"""
Initialize a DetMetrics instance with a save directory, plot flag, callback function, and class names.
"""
# Directory where output plots are saved, defaults to the current directory
self.save_dir = save_dir
# Whether to plot precision-recall curves for each class, defaults to False
self.plot = plot
# Optional callback invoked with the plot path and data when a plot is rendered, defaults to None
self.on_plot = on_plot
# Tuple of class names handled by the detection model, defaults to an empty tuple
self.names = names
# Metric instance that stores the detection metric results
self.box = Metric()
# Dictionary recording the time spent in each phase of the detection pipeline
self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
# Task type: detection
self.task = "detect"
@property
def keys(self):
"""Returns a list of keys for accessing specific metrics."""
# Return the list of keys used to access the specific metric values
return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"]
def mean_results(self):
"""Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95."""
# Compute the mean over detected objects and return precision, recall, mAP50 and mAP50-95
return self.box.mean_results()
def class_result(self, i):
"""Return the result of evaluating the performance of an object detection model on a specific class."""
# Return the detection metrics evaluated for a specific class
return self.box.class_result(i)
@property
def maps(self):
"""Returns mean Average Precision (mAP) scores per class."""
# Return the mean Average Precision (mAP) score for each class
return self.box.maps
@property
def fitness(self):
"""Returns the fitness of box object."""
# Return the fitness score of the box metrics
return self.box.fitness()
@property
def ap_class_index(self):
"""Returns the average precision index per class."""
# Return the average-precision class index for each class
return self.box.ap_class_index
@property
def results_dict(self):
"""Returns dictionary of computed performance metrics and statistics."""
# Return a dictionary of the computed performance metrics and statistics
return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
@property
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
# Return the list of names of the available metric curves
return ["Precision-Recall(B)", "F1-Confidence(B)", "Precision-Confidence(B)", "Recall-Confidence(B)"]
@property
def curves_results(self):
"""Returns dictionary of computed performance metrics and statistics."""
# Return the computed box metric curves
return self.box.curves_results
# SegmentMetrics: subclass of SimpleClass that computes and aggregates detection and segmentation metrics over a given set of classes
class SegmentMetrics(SimpleClass):
"""
Calculates and aggregates detection and segmentation metrics over a given set of classes.
Args:
save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
plot (bool): Whether to save the detection and segmentation plots. Default is False.
on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
names (list): List of class names. Default is an empty list.
Attributes:
save_dir (Path): Path to the directory where the output plots should be saved.
plot (bool): Whether to save the detection and segmentation plots.
on_plot (func): An optional callback to pass plots path and data when they are rendered.
names (list): List of class names.
box (Metric): An instance of the Metric class to calculate box detection metrics.
seg (Metric): An instance of the Metric class to calculate mask segmentation metrics.
speed (dict): Dictionary to store the time taken in different phases of inference.
Methods:
process(tp_m, tp_b, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
mean_results(): Returns the mean of the detection and segmentation metrics over all the classes.
class_result(i): Returns the detection and segmentation metrics of class `i`.
maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95.
fitness: Returns the fitness scores, which are a single weighted combination of metrics.
ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP).
results_dict: Returns the dictionary containing all the detection and segmentation metrics and fitness score.
"""
def __init__(self, save_dir=Path("."), plot=False, on_plot=None, names=()) -> None:
"""Initialize a SegmentMetrics instance with a save directory, plot flag, callback function, and class names."""
# Directory where output plots are saved
self.save_dir = save_dir
# Whether to save the detection and segmentation plots
self.plot = plot
# Optional callback invoked with the plot path and data when a plot is rendered
self.on_plot = on_plot
# List of class names
self.names = names
# Metric instance for box detection metrics
self.box = Metric()
# Metric instance for mask segmentation metrics
self.seg = Metric()
# Dictionary recording the time spent in each phase of inference
self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
# Task type: segmentation
self.task = "segment"
@property
def keys(self):
"""Returns a list of keys for accessing metrics."""
# Return the list of keys used for evaluating detection (boxes) and segmentation (masks)
return [
"metrics/precision(B)",  # box precision
"metrics/recall(B)",  # box recall
"metrics/mAP50(B)",  # box mAP at IoU 0.50
"metrics/mAP50-95(B)",  # box mAP at IoU 0.50-0.95
"metrics/precision(M)",  # mask precision
"metrics/recall(M)",  # mask recall
"metrics/mAP50(M)",  # mask mAP at IoU 0.50
"metrics/mAP50-95(M)",  # mask mAP at IoU 0.50-0.95
]
def mean_results(self):
"""Return the mean metrics for bounding box and segmentation results."""
# Return the mean metrics for boxes and masks
return self.box.mean_results() + self.seg.mean_results()
def class_result(self, i):
"""Returns classification results for a specified class index."""
# Return the box and mask results for the class at index i
return self.box.class_result(i) + self.seg.class_result(i)
@property
def maps(self):
"""Returns mAP scores for object detection and semantic segmentation models."""
# Return the mAP scores for boxes and masks
return self.box.maps + self.seg.maps
@property
def fitness(self):
"""Get the fitness score for both segmentation and bounding box models."""
# Return the combined fitness score of the mask and box metrics
return self.seg.fitness() + self.box.fitness()
@property
def ap_class_index(self):
"""Boxes and masks have the same ap_class_index."""
# Boxes and masks share the same ap_class_index
return self.box.ap_class_index
@property
def results_dict(self):
"""Returns results of object detection model for evaluation."""
# Return the evaluation results of the model as a dictionary
return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
@property
# Return the list of names of the available metric curves
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
return [
"Precision-Recall(B)", # 精確率-召回率(B)
"F1-Confidence(B)", # F1-置信度(B)
"Precision-Confidence(B)", # 精確率-置信度(B)
"Recall-Confidence(B)", # 召回率-置信度(B)
"Precision-Recall(M)", # 精確率-召回率(M)
"F1-Confidence(M)", # F1-置信度(M)
"Precision-Confidence(M)", # 精確率-置信度(M)
"Recall-Confidence(M)", # 召回率-置信度(M)
]
@property
# 返回一個包含計算的效能指標和統計資料的字典。
def curves_results(self):
"""Returns dictionary of computed performance metrics and statistics."""
return self.box.curves_results + self.seg.curves_results
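下面是一個最小的示意片段,說明上述 keys、mean_results 與 fitness 如何組成 results_dict;其中的指標數值為虛構,且假設 Metric.fitness 採用 0.1 * mAP50 + 0.9 * mAP50-95 的加權方式,僅作理解用途,並非實際呼叫 ultralytics 的 API:

```py
# 示意:SegmentMetrics 的 results_dict 組成方式(數值為虛構)
keys = [
    "metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)",
    "metrics/precision(M)", "metrics/recall(M)", "metrics/mAP50(M)", "metrics/mAP50-95(M)",
]
mean_results = [0.80, 0.70, 0.75, 0.50, 0.78, 0.68, 0.72, 0.48]  # box 4 項 + mask 4 項(虛構)
# 假設每個 Metric 的 fitness 為 0.1 * mAP50 + 0.9 * mAP50-95,SegmentMetrics.fitness 為兩者相加
fitness = (0.1 * mean_results[2] + 0.9 * mean_results[3]) + (0.1 * mean_results[6] + 0.9 * mean_results[7])
results_dict = dict(zip(keys + ["fitness"], mean_results + [fitness]))
print(results_dict)
```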
class PoseMetrics(SegmentMetrics):
"""
Calculates and aggregates detection and pose metrics over a given set of classes.
Args:
save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
plot (bool): Whether to save the detection and pose plots. Default is False.
on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
names (list): List of class names. Default is an empty list.
Attributes:
save_dir (Path): Path to the directory where the output plots should be saved.
plot (bool): Whether to save the detection and pose plots.
on_plot (func): An optional callback to pass plots path and data when they are rendered.
names (list): List of class names.
box (Metric): An instance of the Metric class to calculate box detection metrics.
pose (Metric): An instance of the Metric class to calculate pose keypoint metrics.
speed (dict): Dictionary to store the time taken in different phases of inference.
Methods:
process(tp, tp_p, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
mean_results(): Returns the mean of the detection and pose metrics over all the classes.
class_result(i): Returns the detection and pose metrics of class `i`.
maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95.
fitness: Returns the fitness scores, which are a single weighted combination of metrics.
ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP).
results_dict: Returns the dictionary containing all the detection and pose metrics and fitness score.
"""
def __init__(self, save_dir=Path("."), plot=False, on_plot=None, names=()) -> None:
"""Initialize the PoseMetrics class with directory path, class names, and plotting options."""
# 呼叫父類的初始化方法,初始化基礎類SegmentMetrics的屬性
super().__init__(save_dir, plot, names)
# 設定例項屬性:儲存輸出圖表的目錄路徑
self.save_dir = save_dir
# 設定例項屬性:是否儲存檢測和分割圖表的標誌
self.plot = plot
# 設定例項屬性:用於在渲染時傳遞圖表路徑和資料的回撥函式
self.on_plot = on_plot
# 設定例項屬性:類名列表
self.names = names
# 設定例項屬性:用於計算框檢測指標的Metric類例項
self.box = Metric()
# 設定例項屬性:用於計算姿態(關鍵點)指標的Metric類例項
self.pose = Metric()
# 設定例項屬性:儲存推斷不同階段所花費時間的字典
self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
# 設定例項屬性:任務型別為姿勢估計
self.task = "pose"
def process(self, tp, tp_p, conf, pred_cls, target_cls):
"""
Processes the detection and pose metrics over the given set of predictions.
Args:
tp (list): List of True Positive boxes.
tp_p (list): List of True Positive keypoints.
conf (list): List of confidence scores.
pred_cls (list): List of predicted classes.
target_cls (list): List of target classes.
"""
# Calculate pose metrics per class and update PoseEvaluator
results_pose = ap_per_class(
tp_p,
conf,
pred_cls,
target_cls,
plot=self.plot,
on_plot=self.on_plot,
save_dir=self.save_dir,
names=self.names,
prefix="Pose",
)[2:]
# Set the number of classes for pose evaluation
self.pose.nc = len(self.names)
# Update pose metrics with calculated results
self.pose.update(results_pose)
# Calculate box metrics per class and update BoxEvaluator
results_box = ap_per_class(
tp,
conf,
pred_cls,
target_cls,
plot=self.plot,
on_plot=self.on_plot,
save_dir=self.save_dir,
names=self.names,
prefix="Box",
)[2:]
# Set the number of classes for box evaluation
self.box.nc = len(self.names)
# Update box metrics with calculated results
self.box.update(results_box)
@property
def keys(self):
"""Returns list of evaluation metric keys."""
return [
"metrics/precision(B)",
"metrics/recall(B)",
"metrics/mAP50(B)",
"metrics/mAP50-95(B)",
"metrics/precision(P)",
"metrics/recall(P)",
"metrics/mAP50(P)",
"metrics/mAP50-95(P)",
]
def mean_results(self):
"""Return the mean results of box and pose."""
# Return mean results of both box and pose evaluations
return self.box.mean_results() + self.pose.mean_results()
def class_result(self, i):
"""Return the class-wise detection results for a specific class i."""
# Return class-wise detection results for class i from both box and pose evaluations
return self.box.class_result(i) + self.pose.class_result(i)
@property
def maps(self):
"""Returns the mean average precision (mAP) per class for both box and pose detections."""
# Return mean average precision (mAP) per class for both box and pose detections
return self.box.maps + self.pose.maps
@property
def fitness(self):
"""Computes classification metrics and speed using the `targets` and `pred` inputs."""
# Compute classification metrics and speed using the `targets` and `pred` inputs for both box and pose
return self.pose.fitness() + self.box.fitness()
@property
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
# Return a list of curves for accessing specific metrics curves
return [
"Precision-Recall(B)",
"F1-Confidence(B)",
"Precision-Confidence(B)",
"Recall-Confidence(B)",
"Precision-Recall(P)",
"F1-Confidence(P)",
"Precision-Confidence(P)",
"Recall-Confidence(P)",
]
@property
def curves_results(self):
"""Returns dictionary of computed performance metrics and statistics."""
# Return dictionary of computed performance metrics and statistics for both box and pose
return self.box.curves_results + self.pose.curves_results
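以下是 PoseMetrics.process 的示意用法草稿:假設已安裝 ultralytics,tp、tp_p 等矩陣(3 個預測、10 個 IoU 閾值 0.50:0.95)的 TP 判定皆為虛構數值,僅用來展示輸入形狀與呼叫流程:

```py
import numpy as np
from ultralytics.utils.metrics import PoseMetrics

# 假設 3 個預測、2 個真值;每列對應一個 IoU 閾值下的 TP 判定(虛構)
tp   = np.array([[True] * 10, [True] * 6 + [False] * 4, [False] * 10])   # 邊界框 TP
tp_p = np.array([[True] * 8 + [False] * 2, [False] * 10, [False] * 10])  # 關鍵點(OKS)TP
conf = np.array([0.9, 0.8, 0.3])
pred_cls = np.array([0, 0, 0])
target_cls = np.array([0, 0])

pm = PoseMetrics(names={0: "person"})
pm.process(tp, tp_p, conf, pred_cls, target_cls)
print(dict(zip(pm.keys, pm.mean_results())))  # (B) 與 (P) 各 4 個指標
```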
class ClassifyMetrics(SimpleClass):
"""
Class for computing classification metrics including top-1 and top-5 accuracy.
Attributes:
top1 (float): The top-1 accuracy.
top5 (float): The top-5 accuracy.
speed (Dict[str, float]): A dictionary containing the time taken for each step in the pipeline.
fitness (float): The fitness of the model, computed as the mean of top-1 and top-5 accuracy.
results_dict (Dict[str, Union[float, str]]): A dictionary containing the classification metrics and fitness.
keys (List[str]): A list of keys for the results_dict.
Methods:
process(targets, pred): Processes the targets and predictions to compute classification metrics.
"""
def __init__(self) -> None:
"""Initialize a ClassifyMetrics instance."""
# 初始化 top1 和 top5 精度為 0
self.top1 = 0
self.top5 = 0
# 初始化速度字典,包含各個步驟的時間,初始值都為 0.0
self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
# 設定任務型別為分類
self.task = "classify"
def process(self, targets, pred):
"""Target classes and predicted classes."""
# 合併預測結果和目標類別,以便計算準確率
pred, targets = torch.cat(pred), torch.cat(targets)
# 計算每個樣本的正確性
correct = (targets[:, None] == pred).float()
# 計算 top-1 和 top-5 精度
acc = torch.stack((correct[:, 0], correct.max(1).values), dim=1) # (top1, top5) accuracy
self.top1, self.top5 = acc.mean(0).tolist()
@property
def fitness(self):
"""Returns mean of top-1 and top-5 accuracies as fitness score."""
# 計算並返回 top-1 和 top-5 精度的平均值作為 fitness 分數
return (self.top1 + self.top5) / 2
@property
def results_dict(self):
"""Returns a dictionary with model's performance metrics and fitness score."""
# 返回包含模型效能指標和 fitness 分數的字典
return dict(zip(self.keys + ["fitness"], [self.top1, self.top5, self.fitness]))
@property
def keys(self):
"""Returns a list of keys for the results_dict property."""
# 返回結果字典中的鍵列表
return ["metrics/accuracy_top1", "metrics/accuracy_top5"]
@property
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
# 返回一個空列表,用於訪問特定的度量曲線
return []
@property
def curves_results(self):
"""Returns a list of curves for accessing specific metrics curves."""
# 返回一個空列表,用於訪問特定的度量曲線
return []
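下面用一個自包含的小例子示意 ClassifyMetrics 的 top-1 / top-5 計算流程;targets 與 pred 為虛構資料,pred 的每一行是該樣本按置信度排序的前 5 個預測類別索引(假設已安裝 ultralytics):

```py
import torch
from ultralytics.utils.metrics import ClassifyMetrics

targets = [torch.tensor([1, 0, 2])]                 # 每個 batch 的真實類別(虛構)
pred = [torch.tensor([[1, 3, 0, 2, 4],              # 每個樣本的前 5 個預測類別索引(虛構)
                      [2, 0, 1, 3, 4],
                      [0, 1, 3, 4, 2]])]
cm = ClassifyMetrics()
cm.process(targets, pred)
print(cm.top1, cm.top5, cm.fitness)   # 約 0.333、1.0、0.667
print(cm.results_dict)
```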
class OBBMetrics(SimpleClass):
"""Metrics for evaluating oriented bounding box (OBB) detection, see https://arxiv.org/pdf/2106.06072.pdf."""
def __init__(self, save_dir=Path("."), plot=False, on_plot=None, names=()) -> None:
"""Initialize an OBBMetrics instance with directory, plotting, callback, and class names."""
# 初始化 OBBMetrics 例項,包括儲存目錄、繪圖示誌、回撥函式和類名列表
self.save_dir = save_dir
self.plot = plot
self.on_plot = on_plot
self.names = names
# 初始化 Metric 型別的 box 屬性
self.box = Metric()
# 初始化速度字典,包含各個步驟的時間,初始值都為 0.0
self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
# 處理目標檢測的預測結果並更新指標
def process(self, tp, conf, pred_cls, target_cls):
"""Process predicted results for object detection and update metrics."""
# 呼叫 ap_per_class 函式計算每個類別的平均精度等指標,返回結果列表,去掉前兩個元素
results = ap_per_class(
tp,
conf,
pred_cls,
target_cls,
plot=self.plot, # 是否繪製結果的標誌
save_dir=self.save_dir, # 結果儲存目錄
names=self.names, # 類別名稱列表
on_plot=self.on_plot, # 繪圖完成後的回撥函式,用於傳遞圖表路徑與資料
)[2:]
# 更新 self.box 物件的類別數
self.box.nc = len(self.names)
# 呼叫 self.box 物件的 update 方法,更新檢測結果
self.box.update(results)
@property
def keys(self):
"""Returns a list of keys for accessing specific metrics."""
# 返回用於訪問特定指標的鍵列表
return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"]
def mean_results(self):
"""Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95."""
# 呼叫 self.box 物件的 mean_results 方法,計算檢測到的物體的平均指標,返回包含這些指標的列表
return self.box.mean_results()
def class_result(self, i):
"""Return the result of evaluating the performance of an object detection model on a specific class."""
# 呼叫 self.box 物件的 class_result 方法,返回指定類別 i 的效能評估結果
return self.box.class_result(i)
@property
def maps(self):
"""Returns mean Average Precision (mAP) scores per class."""
# 返回每個類別的平均精度 (mAP) 分數列表,由 self.box 物件的 maps 屬性提供
return self.box.maps
@property
def fitness(self):
"""Returns the fitness of box object."""
# 返回 self.box 物件的 fitness 方法計算的適應度值
return self.box.fitness()
@property
def ap_class_index(self):
"""Returns the average precision index per class."""
# 返回每個類別的平均精度索引,由 self.box 物件的 ap_class_index 屬性提供
return self.box.ap_class_index
@property
def results_dict(self):
"""Returns dictionary of computed performance metrics and statistics."""
# 返回計算的效能指標和統計資訊的字典,包括指標鍵列表和適應度值
return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
@property
def curves(self):
"""Returns a list of curves for accessing specific metrics curves."""
# 返回一個曲線列表,用於訪問特定的指標曲線,這裡返回一個空列表
return []
@property
def curves_results(self):
"""Returns a list of curves for accessing specific metrics curves."""
# 返回一個曲線列表,用於訪問特定的指標曲線,這裡返回一個空列表
return []
.\yolov8\ultralytics\utils\ops.py
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib # 匯入上下文管理器相關的模組
import math # 匯入數學函式模組
import re # 匯入正規表示式模組
import time # 匯入時間模組
import cv2 # 匯入OpenCV庫
import numpy as np # 匯入NumPy庫
import torch # 匯入PyTorch庫
import torch.nn.functional as F # 匯入PyTorch的函式模組
from ultralytics.utils import LOGGER # 從ultralytics.utils中匯入LOGGER物件
from ultralytics.utils.metrics import batch_probiou # 從ultralytics.utils.metrics中匯入batch_probiou函式
class Profile(contextlib.ContextDecorator):
"""
YOLOv8 Profile class. Use as a decorator with @Profile() or as a context manager with 'with Profile():'.
Example:
```py
from ultralytics.utils.ops import Profile
with Profile(device=device) as dt:
pass # slow operation here
print(dt) # prints "Elapsed time is 9.5367431640625e-07 s"
```
"""
def __init__(self, t=0.0, device: torch.device = None):
"""
Initialize the Profile class.
Args:
t (float): Initial time. Defaults to 0.0.
device (torch.device): Devices used for model inference. Defaults to None (cpu).
"""
self.t = t # 初始化累計時間
self.device = device # 初始化裝置
self.cuda = bool(device and str(device).startswith("cuda")) # 檢查是否使用CUDA加速
def __enter__(self):
"""Start timing."""
self.start = self.time() # 記錄開始時間
return self
def __exit__(self, type, value, traceback): # noqa
"""Stop timing."""
self.dt = self.time() - self.start # 計算耗時
self.t += self.dt # 累加耗時到總時間
def __str__(self):
"""Returns a human-readable string representing the accumulated elapsed time in the profiler."""
return f"Elapsed time is {self.t} s" # 返回累計的耗時資訊
def time(self):
"""Get current time."""
if self.cuda:
torch.cuda.synchronize(self.device) # 同步CUDA流
return time.time() # 返回當前時間戳
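以下是 Profile 作為上下文管理器的示意用法(假設已安裝 ultralytics;sleep 僅用來模擬耗時操作),dt 記錄單次耗時,t 為累計耗時:

```py
import time
from ultralytics.utils.ops import Profile

dt = (Profile(), Profile())          # 為前處理與推理各建立一個計時器
with dt[0]:
    time.sleep(0.01)                 # 模擬前處理
with dt[1]:
    time.sleep(0.02)                 # 模擬推理
print(dt[0].dt, dt[1].dt, dt[0].t)   # 單次耗時與累計耗時
```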
def segment2box(segment, width=640, height=640):
"""
Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy).
Args:
segment (torch.Tensor): the segment label
width (int): the width of the image. Defaults to 640
height (int): The height of the image. Defaults to 640
Returns:
(np.ndarray): the minimum and maximum x and y values of the segment.
"""
x, y = segment.T # 提取segment的xy座標
inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) # 內部約束條件
x = x[inside] # 過濾符合約束條件的x座標
y = y[inside] # 過濾符合約束條件的y座標
return (
np.array([x.min(), y.min(), x.max(), y.max()], dtype=segment.dtype)
if any(x)
else np.zeros(4, dtype=segment.dtype)
) # 返回segment的最小和最大xy座標,如果沒有符合條件的點則返回全零陣列
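下面是 segment2box 的一個小示例:輸入一段多邊形輪廓(含一個落在影像外的點),該點會先被過濾,再取剩餘點的最小外接框;座標數值為虛構:

```py
import numpy as np
from ultralytics.utils.ops import segment2box

seg = np.array([[10.0, 20.0], [120.0, 40.0], [60.0, 200.0], [-5.0, 30.0]])  # 多邊形輪廓 (x, y)
print(segment2box(seg, width=640, height=640))  # 只保留影像內的點 → [10. 20. 120. 200.]
```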
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False):
"""
Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
specified in (img1_shape) to the shape of a different image (img0_shape).
Args:
img1_shape (tuple): Shape of the image the boxes are currently scaled to (height, width), e.g. the letterboxed inference image.
boxes (torch.Tensor): Bounding boxes in format xyxy.
img0_shape (tuple): Shape of the target image (height, width), typically the original image.
ratio_pad (tuple): Tuple of (ratio, pad) used for scaling. If not provided, it is computed from the two image shapes.
padding (bool): If True, assume the boxes are based on a YOLO-style letterboxed (augmented) image; if False, do regular rescaling.
xywh (bool): Whether the boxes are in xywh format or not. Defaults to False.
Returns:
boxes (torch.Tensor): The rescaled bounding boxes, in the format of (x1, y1, x2, y2).
"""
# 引數說明(中文):img1_shape 為邊界框目前所在影像(例如 letterbox 後的推理影像)的 (高度, 寬度);
# boxes 為 (x1, y1, x2, y2) 格式的邊界框;img0_shape 為縮放目標影像(通常為原始影像)的 (高度, 寬度);
# ratio_pad 為 (ratio, pad) 元組,未提供時根據兩張影像的尺寸差異計算;
# padding 為 True 時假設邊界框基於 YOLO 樣式 letterbox 增強的影像,為 False 時進行常規縮放;
# xywh 表示邊界框是否為 xywh 格式,預設為 False
if ratio_pad is None: # 如果未提供 ratio_pad,則從 img0_shape 計算
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # 計算縮放比例 gain = 目標影像尺寸 / 原始影像尺寸
pad = (
round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), # 計算寬度方向的填充量
round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1), # 計算高度方向的填充量
)
else:
gain = ratio_pad[0][0] # 使用提供的 ratio_pad 中的縮放比例
pad = ratio_pad[1] # 使用提供的 ratio_pad 中的填充量
if padding:
boxes[..., 0] -= pad[0] # 減去 x 方向的填充量
boxes[..., 1] -= pad[1] # 減去 y 方向的填充量
if not xywh:
boxes[..., 2] -= pad[0] # 對於非 xywh 格式的邊界框,再次減去 x 方向的填充量
boxes[..., 3] -= pad[1] # 對於非 xywh 格式的邊界框,再次減去 y 方向的填充量
boxes[..., :4] /= gain # 縮放邊界框座標
return clip_boxes(boxes, img0_shape) # 呼叫 clip_boxes 函式,確保邊界框在影像內部
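以下示意 scale_boxes 的縮放計算:假設原圖為 480x640、letterbox 後的推理影像為 640x640,此時 gain 為 1、上下各填充 80 畫素,因此框的 y 座標會整體減去 80;數值為虛構(假設已安裝 ultralytics):

```py
import torch
from ultralytics.utils.ops import scale_boxes

img1_shape = (640, 640)            # 推理(letterbox)影像尺寸 (h, w)
img0_shape = (480, 640)            # 原始影像尺寸 (h, w)
boxes = torch.tensor([[100.0, 180.0, 300.0, 380.0]])  # 推理影像座標系下的 xyxy 框(虛構)
print(scale_boxes(img1_shape, boxes.clone(), img0_shape))
# gain = min(640/480, 640/640) = 1.0,pad = (0, 80) → 約為 [[100., 100., 300., 300.]]
```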
# 針對旋轉框(OBB)的快速NMS,基於probiou計算兩兩交併比
def nms_rotated(boxes, scores, threshold=0.45):
"""
NMS for oriented bounding boxes, powered by probiou and fast-nms.
Args:
boxes (torch.Tensor): 旋轉框,格式為 (N, 5) 的 xywhr
scores (torch.Tensor): 置信度分數,形狀為 (N,)
threshold (float): IoU 閾值
Returns:
(torch.Tensor): 保留框的索引
"""
# 如果輸入為空,返回一個空的numpy陣列
if len(boxes) == 0:
return np.empty((0,), dtype=np.int8)
# 根據置信度對框進行降序排序
sorted_idx = torch.argsort(scores, descending=True)
boxes = boxes[sorted_idx]
# 計算所有框兩兩之間的probiou得分矩陣,並取其上三角部分
ious = batch_probiou(boxes, boxes).triu_(diagonal=1)
# 根據IoU閾值進行非極大值抑制,保留符合條件的框索引
pick = torch.nonzero(ious.max(dim=0)[0] < threshold).squeeze(-1)
# 返回按照降序排列的被選框的索引
return sorted_idx[pick]
# 執行非極大值抑制(NMS)操作,用於一組邊界框,支援掩碼和每個框多個標籤。
def non_max_suppression(
prediction,
conf_thres=0.25,  # 置信度閾值,低於此閾值的框將被忽略
iou_thres=0.45,  # IoU(交併比)閾值,用於NMS時判斷重疊框
classes=None,  # 類別列表,用於過濾特定類別的框
agnostic=False,  # 是否忽略預測框的類別資訊(類別無關NMS)
multi_label=False,  # 是否支援每個框多個標籤
labels=(),  # 先驗標籤列表,自動標註時用於合併
max_det=300,  # 最大檢測框數
nc=0,  # 類別數量(可選)
max_time_img=0.05,  # 每張影像的最大處理時間(秒)
max_nms=30000,  # 進入torchvision.ops.nms()的最大框數
max_wh=7680,  # 框的最大寬度和高度(畫素)
in_place=True,  # 是否就地修改輸入的prediction張量
rotated=False,  # 是否為旋轉框(OBB)
):
"""
Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
"""
import torchvision # 引入torchvision模組,用於加快“import ultralytics”的速度
# 檢查置信度閾值的有效性,必須在0到1之間
assert 0 <= conf_thres <= 1, f"Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0"
# 檢查IoU閾值的有效性,必須在0到1之間
assert 0 <= iou_thres <= 1, f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
# 如果prediction是一個列表或元組(例如YOLOv8模型在驗證模式下的輸出),選擇推斷輸出部分
if isinstance(prediction, (list, tuple)):
prediction = prediction[0] # 選擇推斷輸出
# 如果指定了classes,則將其轉換為與prediction裝置相同的torch張量
if classes is not None:
classes = torch.tensor(classes, device=prediction.device)
# 如果prediction的最後一個維度為6,說明是端到端模型的輸出(BNC格式,即1,300,6)
if prediction.shape[-1] == 6:
# 對每個預測結果進行置信度閾值過濾
output = [pred[pred[:, 4] > conf_thres] for pred in prediction]
# 如果指定了classes,則進一步根據classes進行過濾
if classes is not None:
output = [pred[(pred[:, 5:6] == classes).any(1)] for pred in output]
return output
# 獲取batch size(BCN格式,即1,84,6300)
bs = prediction.shape[0]
# 如果未指定nc(類別數量),則根據prediction的形狀推斷類別數量
nc = nc or (prediction.shape[1] - 4) # number of classes
# 計算預測結果中的掩碼數量
nm = prediction.shape[1] - nc - 4 # number of masks
# 確定掩碼起始索引
mi = 4 + nc # mask start index
# 根據置信度閾值確定候選項
xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
# 設定時間限制
time_limit = 2.0 + max_time_img * bs # seconds to quit after
# 若多標籤設定為真,則每個框可能有多個標籤(增加0.5ms/影像)
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
# 調整預測結果的維度順序,將最後兩個維度互換
prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
# 如果不是旋轉框,根據需求將預測的邊界框格式從xywh轉換為xyxy
if not rotated:
if in_place:
prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy in-place modification
else:
# 在非原地操作時,將邊界框和其他預測結果連線起來,轉換為xyxy格式
prediction = torch.cat((xywh2xyxy(prediction[..., :4]), prediction[..., 4:]), dim=-1) # xywh to xyxy
# 記錄當前時間
t = time.time()
# 初始化輸出列表,每個元素都是一個空的張量,形狀為(0, 6 + nm),在指定裝置上
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
for xi, x in enumerate(prediction): # 對每個預測結果進行遍歷,xi是索引,x是預測結果
# Apply constraints
# 應用約束條件
# x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
# 對預測結果中的寬度和高度進行約束,將不滿足條件的置為0
x = x[xc[xi]] # confidence
# 根據置信度索引獲取預測結果的子集
# Cat apriori labels if autolabelling
# 如果自動標註,合併先驗標籤
if labels and len(labels[xi]) and not rotated:
lb = labels[xi]
v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
x = torch.cat((x, v), 0)
# 將先驗標籤與預測結果合併,形成新的預測結果
# If none remain process next image
# 如果沒有剩餘的預測結果,則處理下一張影像
if not x.shape[0]:
continue
# Detections matrix nx6 (xyxy, conf, cls)
# 檢測矩陣,大小為nx6(xyxy座標,置信度,類別)
box, cls, mask = x.split((4, nc, nm), 1)
if multi_label:
i, j = torch.where(cls > conf_thres)
x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
# 如果支援多標籤,根據置信度閾值篩選類別,並形成新的預測結果
else: # best class only
conf, j = cls.max(1, keepdim=True)
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
# 否則,選擇最高置信度的類別作為預測結果
# Filter by class
# 根據類別進行過濾
if classes is not None:
x = x[(x[:, 5:6] == classes).any(1)]
# 如果指定了類別,只保留匹配指定類別的預測結果
# Check shape
# 檢查預測結果的形狀
n = x.shape[0] # number of boxes
# n為盒子(邊界框)的數量
if not n: # no boxes
continue
if n > max_nms: # excess boxes
x = x[x[:, 4].argsort(descending=True)[:max_nms]]
# 如果盒子數量超過設定的最大NMS數量,則按置信度排序並保留前max_nms個盒子
# Batched NMS
# 批處理的非極大值抑制(NMS)
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
scores = x[:, 4] # scores
if rotated:
boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1)
i = nms_rotated(boxes, scores, iou_thres)
# 如果啟用了旋轉NMS,對旋轉邊界框進行NMS處理
else:
boxes = x[:, :4] + c
i = torchvision.ops.nms(boxes, scores, iou_thres)
# 否則,對標準邊界框進行NMS處理
i = i[:max_det] # limit detections
# 限制最終的檢測結果數量
output[xi] = x[i]
# 將處理後的預測結果存入輸出中的對應位置
if (time.time() - t) > time_limit:
LOGGER.warning(f"WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded")
break # time limit exceeded
# 如果超過了NMS處理時間限制,記錄警告並跳出迴圈
return output
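下面用一個虛構的預測張量示意 non_max_suppression 的輸入格式 (bs, 4+nc, n) 與輸出:框 1、2 高度重疊,NMS 後僅保留 2 個框(假設已安裝 ultralytics 與 torchvision,數值皆為虛構):

```py
import torch
from ultralytics.utils.ops import non_max_suppression

# batch=1,2 個類別,3 個候選框;前 4 列為 xywh,後 2 列為各類別分數
pred = torch.zeros(1, 6, 3)
pred[0, :4, 0] = torch.tensor([100.0, 100.0, 50.0, 50.0])   # 框1 (cx, cy, w, h)
pred[0, :4, 1] = torch.tensor([104.0, 102.0, 50.0, 50.0])   # 框2,與框1 高度重疊
pred[0, :4, 2] = torch.tensor([300.0, 300.0, 40.0, 40.0])   # 框3,獨立
pred[0, 4, :] = torch.tensor([0.9, 0.8, 0.7])               # 類別0 的分數
pred[0, 5, :] = torch.tensor([0.1, 0.1, 0.1])               # 類別1 的分數
out = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45)
print(out[0].shape)  # 預期 torch.Size([2, 6]):每行為 (x1, y1, x2, y2, conf, cls)
```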
def scale_image(masks, im0_shape, ratio_pad=None):
"""
Takes a mask, and resizes it to the original image size.
Args:
masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
im0_shape (tuple): the original image shape
ratio_pad (tuple): the ratio of the padding to the original image.
Returns:
masks (np.ndarray): The masks that are being returned with shape [h, w, num].
"""
# 獲取當前 masks 的形狀
im1_shape = masks.shape
# 如果當前 masks 形狀與原始圖片形狀相同,則直接返回 masks,無需調整大小
if im1_shape[:2] == im0_shape[:2]:
return masks
# 如果未指定 ratio_pad,則根據 im0_shape 計算 gain 和 pad
if ratio_pad is None:
# 計算 gain,即縮放比例
gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])
# 計算 padding 的寬度和高度
pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2
else:
pad = ratio_pad[1] # 使用指定的 ratio_pad 中的 padding 值
# 將 pad 轉換為整數,表示上、左、下、右的邊界
top, left = int(pad[1]), int(pad[0]) # y, x
bottom, right = int(im1_shape[0] - pad[1]), int(im1_shape[1] - pad[0])
# 如果 masks 的維度小於 2,則丟擲異常
if len(masks.shape) < 2:
raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
# 對 masks 進行裁剪,按照計算得到的邊界進行裁剪
masks = masks[top:bottom, left:right]
# 將裁剪後的 masks 調整大小至原始圖片大小
masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]))
# 檢查 masks 的維度是否為 2
if len(masks.shape) == 2:
# 如果是,新增一個額外的維度,使其變為三維
masks = masks[:, :, None]
# 返回處理後的 masks 變數
return masks
def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
"""
Convert bounding box coordinates from (x1, y1, x2, y2) format to normalized (x, y, width, height) format,
relative to image dimensions and optionally clip the values.
Args:
x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.
w (int): Width of the image. Defaults to 640.
h (int): Height of the image. Defaults to 640.
clip (bool): Whether to clip the boxes to the image boundaries. Defaults to False.
eps (float): The minimum value of the box's width and height. Defaults to 0.0.
Returns:
y (np.ndarray | torch.Tensor): The bounding box coordinates in normalized (x, y, width, height) format.
"""
if clip:
# 呼叫 clip_boxes 函式,將邊界框 x 裁剪到影像邊界內,邊界為 (h - eps, w - eps)
x = clip_boxes(x, (h - eps, w - eps))
assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
# 根據輸入 x 的型別建立與之相同型別的空陣列 y,相比 clone/copy 操作更快
y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x)
# 計算每個邊界框的中心點 x 座標,並歸一化到影像寬度 w
y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w  # x center
# 計算每個邊界框的中心點 y 座標,並歸一化到影像高度 h
y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h  # y center
# 計算每個邊界框的寬度,並歸一化到影像寬度 w
y[..., 2] = (x[..., 2] - x[..., 0]) / w  # width
# 計算每個邊界框的高度,並歸一化到影像高度 h
y[..., 3] = (x[..., 3] - x[..., 1]) / h  # height
# 返回歸一化後的 (x, y, width, height) 邊界框座標
return y
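以下示意 xyxy 與歸一化 xywh 之間的往返轉換(xywhn2xyxy 為 ops.py 中對應的反向函式);數值為虛構,假設已安裝 ultralytics:

```py
import numpy as np
from ultralytics.utils.ops import xyxy2xywhn, xywhn2xyxy

boxes = np.array([[64.0, 48.0, 192.0, 144.0]])     # xyxy,影像大小 640x480(虛構)
norm = xyxy2xywhn(boxes, w=640, h=480)
print(norm)                                         # [[0.2 0.2 0.2 0.2]]
print(xywhn2xyxy(norm, w=640, h=480))               # 還原回 [[64. 48. 192. 144.]]
```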
def xywh2ltwh(x):
"""
將邊界框格式從 [x, y, w, h] 轉換為 [x1, y1, w, h],其中 x1, y1 是左上角座標。
Args:
x (np.ndarray | torch.Tensor): 輸入張量,包含 xywh 格式的邊界框座標
Returns:
y (np.ndarray | torch.Tensor): 輸出張量,包含 xyltwh 格式的邊界框座標
"""
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[..., 0] = x[..., 0] - x[..., 2] / 2 # 左上角 x 座標
y[..., 1] = x[..., 1] - x[..., 3] / 2 # 左上角 y 座標
return y
def xyxy2ltwh(x):
"""
將多個 [x1, y1, x2, y2] 格式的邊界框轉換為 [x1, y1, w, h] 格式,其中 xy1 是左上角,xy2 是右下角。
Args:
x (np.ndarray | torch.Tensor): 輸入張量,包含 xyxy 格式的邊界框座標
Returns:
y (np.ndarray | torch.Tensor): 輸出張量,包含 xyltwh 格式的邊界框座標
"""
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[..., 2] = x[..., 2] - x[..., 0] # 寬度
y[..., 3] = x[..., 3] - x[..., 1] # 高度
return y
def ltwh2xywh(x):
"""
將 [x1, y1, w, h] 格式的邊界框轉換為 [x, y, w, h] 格式,其中 xy1 是左上角,xy 是中心座標。
Args:
x (torch.Tensor): 輸入張量
Returns:
y (np.ndarray | torch.Tensor): 輸出張量,包含 xywh 格式的邊界框座標
"""
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[..., 0] = x[..., 0] + x[..., 2] / 2 # 中心 x 座標
y[..., 1] = x[..., 1] + x[..., 3] / 2 # 中心 y 座標
return y
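下面示意 xywh、ltwh 與 xyxy 幾種格式之間的往返轉換,數值為虛構(假設已安裝 ultralytics):

```py
import numpy as np
from ultralytics.utils.ops import xywh2ltwh, ltwh2xywh, xyxy2ltwh

xywh = np.array([[50.0, 60.0, 20.0, 40.0]])              # 中心點格式 (cx, cy, w, h)
ltwh = xywh2ltwh(xywh)
print(ltwh)                                               # [[40. 40. 20. 40.]] 左上角格式
print(ltwh2xywh(ltwh))                                    # 還原 [[50. 60. 20. 40.]]
print(xyxy2ltwh(np.array([[40.0, 40.0, 60.0, 80.0]])))    # [[40. 40. 20. 40.]]
```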
def xyxyxyxy2xywhr(x):
"""
將批次的方向邊界框 (OBB) 從 [xy1, xy2, xy3, xy4] 格式轉換為 [cx, cy, w, h, rotation] 格式。
旋轉角度的範圍是從 0 到 90 度。
Args:
x (numpy.ndarray | torch.Tensor): 輸入的角點陣列 [xy1, xy2, xy3, xy4],形狀為 (n, 8)。
Returns:
(numpy.ndarray | torch.Tensor): 轉換後的資料,形狀為 (n, 5),包含 [cx, cy, w, h, rotation] 格式。
"""
is_torch = isinstance(x, torch.Tensor)
points = x.cpu().numpy() if is_torch else x
points = points.reshape(len(x), -1, 2)
rboxes = []
for pts in points:
# 注意: 使用 cv2.minAreaRect 來獲取準確的 xywhr 格式,
# 特別是當資料載入器中的一些物件因增強而被裁剪時。
(cx, cy), (w, h), angle = cv2.minAreaRect(pts)
rboxes.append([cx, cy, w, h, angle / 180 * np.pi])
return torch.tensor(rboxes, device=x.device, dtype=x.dtype) if is_torch else np.asarray(rboxes)
def xywhr2xyxyxyxy(x):
"""
將批次的方向邊界框 (OBB) 從 [cx, cy, w, h, rotation] 格式轉換為 [xy1, xy2, xy3, xy4] 格式。
旋轉角度的範圍應為 0 到 90 度。
Args:
x (numpy.ndarray | torch.Tensor): 輸入的旋轉框,格式為 [cx, cy, w, h, rotation],形狀為 (n, 5) 或 (b, n, 5)。
Returns:
(numpy.ndarray | torch.Tensor): 轉換後的角點陣列,形狀為 (n, 4, 2) 或 (b, n, 4, 2)。
"""
# 根據輸入的張量型別選擇對應的數學函式庫
cos, sin, cat, stack = (
(torch.cos, torch.sin, torch.cat, torch.stack)
if isinstance(x, torch.Tensor)
else (np.cos, np.sin, np.concatenate, np.stack)
)
# 提取張量 x 的中心座標
ctr = x[..., :2]
# 提取張量 x 的寬度、高度和角度資訊
w, h, angle = (x[..., i : i + 1] for i in range(2, 5))
# 計算角度的餘弦和正弦值
cos_value, sin_value = cos(angle), sin(angle)
# 計算第一個向量 vec1
vec1 = [w / 2 * cos_value, w / 2 * sin_value]
# 計算第二個向量 vec2
vec2 = [-h / 2 * sin_value, h / 2 * cos_value]
# 合併向量 vec1 的兩個分量
vec1 = cat(vec1, -1)
# 合併向量 vec2 的兩個分量
vec2 = cat(vec2, -1)
# 計算矩形的四個頂點
pt1 = ctr + vec1 + vec2
pt2 = ctr + vec1 - vec2
pt3 = ctr - vec1 - vec2
pt4 = ctr - vec1 + vec2
# 將四個頂點按行堆疊形成新的張量,並沿著倒數第二個維度堆疊
return stack([pt1, pt2, pt3, pt4], -2)
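以下為旋轉框與四頂點表示互轉的示意片段:先用 xywhr2xyxyxyxy 取得四個頂點,再用 xyxyxyxy2xywhr 還原;由於 cv2.minAreaRect 對寬高與角度的表示可能與輸入等價但不同,結果僅為近似還原,數值為虛構:

```py
import numpy as np
import torch
from ultralytics.utils.ops import xywhr2xyxyxyxy, xyxyxyxy2xywhr

rbox = torch.tensor([[100.0, 100.0, 60.0, 20.0, np.pi / 6]])  # cx, cy, w, h, 旋轉角(弧度)
corners = xywhr2xyxyxyxy(rbox)
print(corners.shape)                            # torch.Size([1, 4, 2]),四個頂點
print(xyxyxyxy2xywhr(corners.reshape(1, -1)))   # 近似還原為 [cx, cy, w, h, r](寬高/角度表示可能互換)
```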
def ltwh2xyxy(x):
"""
將邊界框從[x1, y1, w, h]轉換為[x1, y1, x2, y2],其中xy1為左上角,xy2為右下角。
Args:
x (np.ndarray | torch.Tensor): 輸入的影像或張量
Returns:
y (np.ndarray | torch.Tensor): 邊界框的xyxy座標
"""
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[..., 2] = x[..., 2] + x[..., 0] # 計算寬度
y[..., 3] = x[..., 3] + x[..., 1] # 計算高度
return y
def segments2boxes(segments):
"""
將分段標籤轉換為框標籤,即(cls, xy1, xy2, ...)轉換為(cls, xywh)
Args:
segments (list): 分段列表,每個分段是一個點列表,每個點是一個包含x, y座標的列表
Returns:
(np.ndarray): 邊界框的xywh座標
"""
boxes = []
for s in segments:
x, y = s.T # 提取分段的xy座標
boxes.append([x.min(), y.min(), x.max(), y.max()]) # 計算xyxy座標
return xyxy2xywh(np.array(boxes)) # 轉換為xywh座標
def resample_segments(segments, n=1000):
"""
將分段列表(samples,2)輸入並將其上取樣到每個n點的分段列表(samples,2)。
Args:
segments (list): 包含(samples,2)陣列的列表,其中samples是分段中的點數。
n (int): 要上取樣到的點數,預設為1000。
Returns:
segments (list): 上取樣後的分段列表。
"""
for i, s in enumerate(segments):
s = np.concatenate((s, s[0:1, :]), axis=0) # 首尾相接,閉合分段
x = np.linspace(0, len(s) - 1, n)
xp = np.arange(len(s))
segments[i] = (
np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)], dtype=np.float32).reshape(2, -1).T
) # 插值獲取上取樣點
return segments
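下面示意 resample_segments 將一個三角形輪廓上取樣為 100 個點,並用 segments2boxes 取得對應的 xywh 框;數值為虛構(假設已安裝 ultralytics):

```py
import numpy as np
from ultralytics.utils.ops import resample_segments, segments2boxes

seg = [np.array([[0.0, 0.0], [10.0, 0.0], [10.0, 5.0]], dtype=np.float32)]  # 一個三角形輪廓
up = resample_segments(seg, n=100)
print(up[0].shape)              # (100, 2),上取樣後的輪廓點
print(segments2boxes(up))       # 對應的 xywh 框,約為 [[5. 2.5 10. 5.]]
```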
def crop_mask(masks, boxes):
"""
根據邊界框裁剪掩模,並返回裁剪後的掩模。
Args:
masks (torch.Tensor): [n, h, w] 掩模張量
boxes (torch.Tensor): [n, 4] 相對點形式的邊界框座標
Returns:
(torch.Tensor): 裁剪後的掩模
"""
_, h, w = masks.shape
x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # 分離邊界框座標
r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # x 方向(寬度)索引,形狀 (1, 1, w)
c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # y 方向(高度)索引,形狀 (1, h, 1)
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
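以下用一張 4x4 的全 1 掩模示意 crop_mask 的行為:只有邊界框內的位置保留為 1,其餘被置 0(假設已安裝 ultralytics):

```py
import torch
from ultralytics.utils.ops import crop_mask

masks = torch.ones(1, 4, 4)                    # 一張 4x4 全 1 的掩模
boxes = torch.tensor([[1.0, 1.0, 3.0, 3.0]])   # 邊界框 (x1, y1, x2, y2)
print(crop_mask(masks, boxes)[0])              # 僅行/列索引 1~2 的區域保留為 1
```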
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
"""
使用掩模頭部的輸出,將掩模應用於邊界框。
Args:
protos (torch.Tensor): [mask_dim, mask_h, mask_w],掩模原型張量
masks_in (torch.Tensor): [n, mask_dim],經 NMS 後每個例項的掩模係數
bboxes (torch.Tensor): [n, 4],經 NMS 後的邊界框座標
shape (tuple): 輸入影像的尺寸 (高度, 寬度)
upsample (bool): 是否將掩模上取樣到輸入影像尺寸,預設為 False
Returns:
(torch.Tensor): 形狀為 [n, h, w] 的二值掩模張量,n 為 NMS 後的掩模數量
"""
# 獲取 protos 張量的形狀資訊,分別賦值給 c, mh, mw
c, mh, mw = protos.shape # CHW
# 解構 shape 元組,獲取輸入影像的高度和寬度資訊,分別賦值給 ih, iw
ih, iw = shape
# 計算每個 mask 的輸出,透過 masks_in 與 protos 的矩陣乘法,再重新 reshape 成 [n, mh, mw] 的形狀
masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw) # CHW
# 計算寬度和高度的比率,用於將 bounding boxes 按比例縮放
width_ratio = mw / iw
height_ratio = mh / ih
# 複製 bounding boxes 張量,按照比率調整左上角和右下角的座標
downsampled_bboxes = bboxes.clone()
downsampled_bboxes[:, 0] *= width_ratio
downsampled_bboxes[:, 2] *= width_ratio
downsampled_bboxes[:, 3] *= height_ratio
downsampled_bboxes[:, 1] *= height_ratio
# 裁剪 masks,根據 downsampled_bboxes 中的邊界框資訊進行裁剪,輸出結果的形狀保持為 CHW
masks = crop_mask(masks, downsampled_bboxes) # CHW
# 如果 upsample 標誌為 True,則對 masks 進行雙線性插值,將其尺寸調整為輸入影像大小 shape,形狀變為 [n, h, w]
if upsample:
masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0] # CHW
# 返回 masks 張量中大於 0.0 的元素,即二值化後的二進位制 mask 張量,形狀為 [n, h, w]
return masks.gt_(0.0)
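下面是 process_mask 的示意呼叫:掩模原型與係數皆為隨機的虛構張量,僅用來展示輸入輸出的形狀(係數與原型做矩陣乘法後裁剪、再上取樣到輸入影像大小),假設已安裝 ultralytics:

```py
import torch
from ultralytics.utils.ops import process_mask

protos = torch.randn(32, 160, 160)            # 假設的掩模原型 (mask_dim, mh, mw)
masks_in = torch.randn(2, 32)                 # 2 個例項的掩模係數(虛構)
bboxes = torch.tensor([[0.0, 0.0, 320.0, 320.0], [100.0, 100.0, 300.0, 260.0]])
masks = process_mask(protos, masks_in, bboxes, shape=(640, 640), upsample=True)
print(masks.shape)                            # torch.Size([2, 640, 640]),值為 0/1
```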
# 定義函式 process_mask_native,處理原生掩模的邏輯
def process_mask_native(protos, masks_in, bboxes, shape):
"""
It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
Args:
protos (torch.Tensor): [mask_dim, mask_h, mask_w],原型掩模的張量,形狀為 [掩模維度, 高度, 寬度]
masks_in (torch.Tensor): [n, mask_dim],經 NMS 後的掩模張量,形狀為 [n, 掩模維度],n 為經過 NMS 後的掩模數量
bboxes (torch.Tensor): [n, 4],經 NMS 後的邊界框張量,形狀為 [n, 4],n 為經過 NMS 後的掩模數量
shape (tuple): 輸入影像的尺寸 (高度, 寬度)
Returns:
masks (torch.Tensor): 處理後的掩模張量,形狀為 [高度, 寬度, n]
"""
c, mh, mw = protos.shape # 獲取原型掩模的通道數、高度、寬度
masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw) # 計算掩模張量,進行上取樣後裁剪到邊界框大小
masks = scale_masks(masks[None], shape)[0] # 對掩模進行尺寸縮放
masks = crop_mask(masks, bboxes) # 根據邊界框裁剪掩模
return masks.gt_(0.0) # 返回掩模張量,應用大於零的閾值處理
# 定義函式 scale_masks,將分段掩模尺寸縮放到指定形狀
def scale_masks(masks, shape, padding=True):
"""
Rescale segment masks to shape.
Args:
masks (torch.Tensor): (N, C, H, W),掩模張量,形狀為 (批次大小, 通道數, 高度, 寬度)
shape (tuple): 目標高度和寬度
padding (bool): 如果為 True,則假設邊界框基於 YOLO 樣式增強的影像。如果為 False,則進行常規尺寸縮放。
Returns:
masks (torch.Tensor): 縮放後的掩模張量
"""
mh, mw = masks.shape[2:] # 獲取掩模張量的高度和寬度
gain = min(mh / shape[0], mw / shape[1]) # 計算縮放比例 gain = 舊尺寸 / 新尺寸
pad = [mw - shape[1] * gain, mh - shape[0] * gain] # 計算高度和寬度的填充值
if padding:
pad[0] /= 2 # 寬度填充減半
pad[1] /= 2 # 高度填充減半
top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # 計算頂部和左側填充位置
bottom, right = (int(mh - pad[1]), int(mw - pad[0])) # 計算底部和右側填充位置
masks = masks[..., top:bottom, left:right] # 對掩模進行裁剪
masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # 使用雙線性插值對掩模進行尺寸縮放
return masks
# 定義函式 scale_coords,將影像 1 的分割座標縮放到影像 0 的尺寸
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize=False, padding=True):
"""
Rescale segment coordinates (xy) from img1_shape to img0_shape.
Args:
img1_shape (tuple): 座標所在影像的尺寸。
coords (torch.Tensor): 需要縮放的座標,形狀為 n,2。
img0_shape (tuple): 應用分割的目標影像的尺寸。
ratio_pad (tuple): 影像尺寸與填充影像尺寸的比例。
normalize (bool): 如果為 True,則將座標歸一化到 [0, 1] 範圍內。預設為 False。
padding (bool): 如果為 True,則假設邊界框基於 YOLO 樣式增強的影像。如果為 False,則進行常規尺寸縮放。
Returns:
coords (torch.Tensor): 縮放後的座標。
"""
if ratio_pad is None: # 如果沒有指定比例,則根據影像 0 的尺寸計算
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # 計算縮放比例 gain = 舊尺寸 / 新尺寸
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # 計算高度和寬度的填充值
else:
gain = ratio_pad[0][0] # 獲取填充比例的縮放增益
pad = ratio_pad[1] # 獲取填充值
if padding:
coords[..., 0] -= pad[0] # 減去 x 方向的填充值
coords[..., 1] -= pad[1] # 減去 y 方向的填充值
coords[..., 0] /= gain # 根據縮放增益進行 x 座標縮放
coords[..., 1] /= gain # 根據縮放增益進行 y 座標縮放
coords = clip_coords(coords, img0_shape) # 呼叫 clip_coords 函式對座標進行裁剪
# 如果 normalize 引數為 True,則進行座標歸一化處理
if normalize:
# 將所有座標點的 x 值除以影像寬度,實現 x 座標的歸一化
coords[..., 0] /= img0_shape[1] # width
# 將所有座標點的 y 值除以影像高度,實現 y 座標的歸一化
coords[..., 1] /= img0_shape[0] # height
# 返回歸一化後的座標陣列
return coords
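以下示意 scale_coords 將 letterbox 影像 (640x640) 上的關鍵點座標還原到原圖 (480x640):此時 gain 為 1、pad 為 (0, 80),因此 y 座標整體減去 80;數值為虛構:

```py
import torch
from ultralytics.utils.ops import scale_coords

coords = torch.tensor([[320.0, 400.0], [100.0, 180.0]])   # letterbox 影像上的 (x, y) 座標
print(scale_coords((640, 640), coords.clone(), (480, 640)))
# gain=1,pad=(0, 80) → 約為 [[320., 320.], [100., 100.]]
```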
# Regularize rotated boxes in range [0, pi/2].
def regularize_rboxes(rboxes):
x, y, w, h, t = rboxes.unbind(dim=-1)
# Swap edge and angle if h >= w
w_ = torch.where(w > h, w, h) # Determine the maximum edge length
h_ = torch.where(w > h, h, w) # Determine the minimum edge length
t = torch.where(w > h, t, t + math.pi / 2) % math.pi # Adjust angle if height is greater than width
return torch.stack([x, y, w_, h_, t], dim=-1) # Stack the regularized boxes
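下面示意 regularize_rboxes 的規整化效果:當 h > w 時交換長短邊,並將角度加上 pi/2 後對 pi 取模;數值為虛構:

```py
import torch
from ultralytics.utils.ops import regularize_rboxes

rboxes = torch.tensor([[50.0, 50.0, 10.0, 30.0, 0.2]])   # w < h 的旋轉框(虛構)
print(regularize_rboxes(rboxes))
# 約為 [[50., 50., 30., 10., 0.2 + pi/2]]:長邊統一放在 w,角度對 pi 取模
```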
# It takes a list of masks(n,h,w) and returns a list of segments(n,xy)
def masks2segments(masks, strategy="largest"):
segments = []
for x in masks.int().cpu().numpy().astype("uint8"):
# Find contours in the mask image
c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
if c:
if strategy == "concat": # concatenate all segments
c = np.concatenate([x.reshape(-1, 2) for x in c])
elif strategy == "largest": # select largest segment
c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
else:
c = np.zeros((0, 2)) # no segments found
segments.append(c.astype("float32"))
return segments
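以下用一張 8x8 的矩形掩模示意 masks2segments 抽取輪廓點的流程(內部依賴 cv2.findContours,輪廓點數 N 由其結果決定):

```py
import torch
from ultralytics.utils.ops import masks2segments

mask = torch.zeros(1, 8, 8)
mask[0, 2:6, 3:7] = 1                      # 一個矩形前景區域(虛構)
segs = masks2segments(mask, strategy="largest")
print(segs[0].shape)                       # (N, 2) 的輪廓點座標
```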
# Convert a batch of FP32 torch tensors (0.0-1.0) to a NumPy uint8 array (0-255), changing from BCHW to BHWC layout.
def convert_torch2numpy_batch(batch: torch.Tensor) -> np.ndarray:
return (batch.permute(0, 2, 3, 1).contiguous() * 255).clamp(0, 255).to(torch.uint8).cpu().numpy()
# Cleans a string by replacing special characters with underscore _
def clean_str(s):
return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)
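最後示意 convert_torch2numpy_batch 與 clean_str 的用法:前者把 BCHW、數值 0~1 的批次張量轉成 BHWC 的 uint8 陣列,後者把特殊字元替換為底線;輸入皆為虛構:

```py
import torch
from ultralytics.utils.ops import convert_torch2numpy_batch, clean_str

batch = torch.rand(2, 3, 4, 4)                        # BCHW、0.0~1.0 的批次影像(虛構)
arr = convert_torch2numpy_batch(batch)
print(arr.shape, arr.dtype)                           # (2, 4, 4, 3) uint8
print(clean_str("rtsp://user@host:554/stream?x=1"))   # 特殊字元被替換為底線
```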