config.py¶
In [ ]:
import torch
BATCH_SIZE = 8 # Increase / decrease according to GPU memory.
RESIZE_TO = 640 # Resize the image for training and transforms.
NUM_EPOCHS = 30 # Number of epochs to train for. (60)
NUM_WORKERS = 2 # Number of parallel workers for data loading.(4)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Training images and labels files directory.
TRAIN_DIR = "/content/data/train"
# Validation images and labels files directory.
VALID_DIR = "/content/data/valid"
# Classes: 0 index is reserved for background.
CLASSES = ["__background__", "buffalo", "elephant", "rhino", "zebra"]
NUM_CLASSES = len(CLASSES)
# Whether to visualize images after creating the data loaders.
VISUALIZE_TRANSFORMED_IMAGES = True
# Location to save model and plots.
OUT_DIR = "/content/outputs"
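Before moving on, it helps to confirm that the data on disk follows the layout the rest of the code assumes: TRAIN_DIR and VALID_DIR should each contain an images/ folder and a labels/ folder with one .txt annotation file per image (this is the layout CustomDataset in datasets.py expects). The cell below is a small, optional sanity check built only from the config values above.
In [ ]:
import os

# Optional sanity check of the expected dataset layout: <split>/images and <split>/labels.
for split_dir in (TRAIN_DIR, VALID_DIR):
    images_dir = os.path.join(split_dir, "images")
    labels_dir = os.path.join(split_dir, "labels")
    print(
        split_dir,
        "| images/ exists:", os.path.isdir(images_dir),
        "| labels/ exists:", os.path.isdir(labels_dir),
    )
print("Device:", DEVICE, "| Classes:", CLASSES)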
custom_utils.py¶
In [ ]:
import albumentations as A
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
from albumentations.pytorch import ToTensorV2
#from config import DEVICE, CLASSES, BATCH_SIZE
OUT_DIR = "/content/outputs"
plt.style.use("ggplot")
class Averager:
"""
A class to keep track of running average of values (e.g. training loss).
"""
def __init__(self):
self.current_total = 0.0
self.iterations = 0.0
def send(self, value):
self.current_total += value
self.iterations += 1
@property
def value(self):
if self.iterations == 0:
return 0
else:
return self.current_total / self.iterations
def reset(self):
self.current_total = 0.0
self.iterations = 0.0
class SaveBestModel:
"""
Saves the model if the current epoch's validation mAP is higher
than all previously observed values.
"""
    def __init__(self, best_valid_map=0.0):
self.best_valid_map = best_valid_map
def __call__(
self,
model,
current_valid_map,
epoch,
OUT_DIR,
):
if current_valid_map > self.best_valid_map:
self.best_valid_map = current_valid_map
print(f"\nBEST VALIDATION mAP: {self.best_valid_map}")
print(f"SAVING BEST MODEL FOR EPOCH: {epoch+1}\n")
torch.save(
{
"epoch": epoch + 1,
"model_state_dict": model.state_dict(),
},
f"{OUT_DIR}/best_model.pth",
)
def collate_fn(batch):
"""
To handle the data loading as different images may have different
numbers of objects, and to handle varying-size tensors as well.
"""
return tuple(zip(*batch))
def get_train_transform():
# We keep "pascal_voc" because bounding box format is [x_min, y_min, x_max, y_max].
return A.Compose(
[
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.5),
A.Rotate(limit=45),
A.Blur(blur_limit=3, p=0.2),
A.MotionBlur(blur_limit=3, p=0.1),
A.MedianBlur(blur_limit=3, p=0.1),
A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3),
A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, p=0.3),
A.RandomScale(scale_limit=0.2, p=0.3),
ToTensorV2(p=1.0),
],
bbox_params={"format": "pascal_voc", "label_fields": ["labels"]},
)
def get_valid_transform():
return A.Compose(
[
ToTensorV2(p=1.0),
],
bbox_params={"format": "pascal_voc", "label_fields": ["labels"]},
)
def show_transformed_image(train_loader):
"""
Visualize transformed images from the `train_loader` for debugging.
Only runs if `VISUALIZE_TRANSFORMED_IMAGES = True` in config.py.
"""
if len(train_loader) > 0:
        for _ in range(2):
images, targets = next(iter(train_loader))
images = list(image.to(DEVICE) for image in images)
targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
for i in range(len(images)):
if len(targets[i]["boxes"]) == 0:
continue
boxes = targets[i]["boxes"].cpu().numpy().astype(np.int32)
labels = targets[i]["labels"].cpu().numpy().astype(np.int32)
sample = images[i].permute(1, 2, 0).cpu().numpy()
sample = cv2.cvtColor(sample, cv2.COLOR_RGB2BGR)
for box_num, box in enumerate(boxes):
cv2.rectangle(sample, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 2)
cv2.putText(
sample,
CLASSES[labels[box_num]],
(box[0], box[1] - 10),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(0, 0, 255),
2,
)
cv2_imshow(sample)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
def save_model(epoch, model, optimizer):
"""
Save the trained model (state dict) and optimizer state to disk.
"""
torch.save(
{
"epoch": epoch + 1,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"outputs/last_model.pth",
)
def save_loss_plot(OUT_DIR, train_loss_list, x_label="iterations", y_label="train loss", save_name="train_loss"):
"""
Saves the training loss curve.
"""
plt.figure(figsize=(10, 7))
plt.plot(train_loss_list, color="tab:blue")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.savefig(f"{OUT_DIR}/{save_name}.png")
# plt.close()
print("SAVING PLOTS COMPLETE...")
def save_mAP(OUT_DIR, map_05, map):
"""
Saves the mAP@0.5 and mAP@0.5:0.95 curves per epoch.
"""
plt.figure(figsize=(10, 7))
plt.plot(map_05, color="tab:orange", linestyle="-", label="mAP@0.5")
plt.plot(map, color="tab:red", linestyle="-", label="mAP@0.5:0.95")
plt.xlabel("Epochs")
plt.ylabel("mAP")
plt.legend()
plt.savefig(f"{OUT_DIR}/map.png")
# plt.close()
print("SAVING mAP PLOTS COMPLETE...")
datasets.py¶
In [ ]:
import torch
import cv2
import numpy as np
import os
import glob
from google.colab.patches import cv2_imshow
from torch.utils.data import Dataset, DataLoader
#from config import CLASSES, RESIZE_TO, TRAIN_DIR, BATCH_SIZE
#from custom_utils import collate_fn, get_train_transform, get_valid_transform
class CustomDataset(Dataset):
def __init__(self, dir_path, width, height, classes, transforms=None):
"""
:param dir_path: Directory containing 'images/' and 'labels/' subfolders.
:param width: Resized image width.
:param height: Resized image height.
        :param classes: List of class names (index 0 is reserved for background).
:param transforms: Albumentations transformations to apply.
"""
self.transforms = transforms
self.dir_path = dir_path
self.image_dir = os.path.join(self.dir_path, "images")
self.label_dir = os.path.join(self.dir_path, "labels")
self.width = width
self.height = height
self.classes = classes
# Gather all image paths
self.image_file_types = ["*.jpg", "*.jpeg", "*.png", "*.ppm", "*.JPG"]
self.all_image_paths = []
for file_type in self.image_file_types:
self.all_image_paths.extend(glob.glob(os.path.join(self.image_dir, file_type)))
# Sort for consistent ordering
self.all_image_paths = sorted(self.all_image_paths)
self.all_image_names = [os.path.basename(img_p) for img_p in self.all_image_paths]
def __len__(self):
return len(self.all_image_paths)
def __getitem__(self, idx):
# 1) Read image
image_name = self.all_image_names[idx]
image_path = os.path.join(self.image_dir, image_name)
label_filename = os.path.splitext(image_name)[0] + ".txt"
label_path = os.path.join(self.label_dir, label_filename)
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
# 2) Resize image (to the model's expected size)
image_resized = cv2.resize(image, (self.width, self.height))
image_resized /= 255.0 # Scale pixel values to [0, 1]
# 3) Read bounding boxes (normalized) from .txt file
boxes = []
labels = []
if os.path.exists(label_path):
with open(label_path, "r") as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if not line:
continue
# Format: class_id x_min y_min x_max y_max (all in [0..1])
parts = line.split()
class_id = int(parts[0]) # e.g. 0, 1, 2, ...
xmin = float(parts[1])
ymin = float(parts[2])
xmax = float(parts[3])
ymax = float(parts[4])
# Example: if you want class IDs to start at 1 for foreground
# and background=0, do:
label_idx = class_id + 1
# Convert normalized coords to absolute (in resized space)
x_min_final = xmin * self.width
y_min_final = ymin * self.height
x_max_final = xmax * self.width
y_max_final = ymax * self.height
# Ensure valid box
if x_max_final <= x_min_final:
x_max_final = x_min_final + 1
if y_max_final <= y_min_final:
y_max_final = y_min_final + 1
# Clip if out of bounds
x_min_final = max(0, min(x_min_final, self.width - 1))
x_max_final = max(0, min(x_max_final, self.width))
y_min_final = max(0, min(y_min_final, self.height - 1))
y_max_final = max(0, min(y_max_final, self.height))
boxes.append([x_min_final, y_min_final, x_max_final, y_max_final])
labels.append(label_idx)
# 4) Convert boxes & labels to Torch tensors
if len(boxes) == 0:
boxes = torch.zeros((0, 4), dtype=torch.float32)
labels = torch.zeros((0,), dtype=torch.int64)
else:
boxes = torch.tensor(boxes, dtype=torch.float32)
labels = torch.tensor(labels, dtype=torch.int64)
# 5) Prepare the target dict
area = (
(boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
if len(boxes) > 0
else torch.tensor([], dtype=torch.float32)
)
iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)
image_id = torch.tensor([idx])
target = {"boxes": boxes, "labels": labels, "area": area, "iscrowd": iscrowd, "image_id": image_id}
# 6) Albumentations transforms: pass Python lists, not Tensors
if self.transforms:
bboxes_list = boxes.cpu().numpy().tolist() # shape: list of [xmin, ymin, xmax, ymax]
labels_list = labels.cpu().numpy().tolist() # shape: list of ints
transformed = self.transforms(
image=image_resized,
bboxes=bboxes_list,
labels=labels_list,
)
# Reassign the image
image_resized = transformed["image"]
# Convert bboxes back to Torch Tensors
new_bboxes_list = transformed["bboxes"] # list of [xmin, ymin, xmax, ymax]
new_labels_list = transformed["labels"] # list of int
if len(new_bboxes_list) > 0:
new_bboxes = torch.tensor(new_bboxes_list, dtype=torch.float32)
new_labels = torch.tensor(new_labels_list, dtype=torch.int64)
else:
new_bboxes = torch.zeros((0, 4), dtype=torch.float32)
new_labels = torch.zeros((0,), dtype=torch.int64)
target["boxes"] = new_bboxes
target["labels"] = new_labels
return image_resized, target
# ---------------------------------------------------------
# Create train/valid datasets and loaders
# ---------------------------------------------------------
def create_train_dataset(DIR):
train_dataset = CustomDataset(
dir_path=DIR, width=RESIZE_TO, height=RESIZE_TO, classes=CLASSES, transforms=get_train_transform()
)
return train_dataset
def create_valid_dataset(DIR):
valid_dataset = CustomDataset(
dir_path=DIR, width=RESIZE_TO, height=RESIZE_TO, classes=CLASSES, transforms=get_valid_transform()
)
return valid_dataset
def create_train_loader(train_dataset, num_workers=0):
train_loader = DataLoader(
train_dataset,
batch_size=BATCH_SIZE,
shuffle=True,
num_workers=num_workers,
collate_fn=collate_fn,
drop_last=True,
)
return train_loader
def create_valid_loader(valid_dataset, num_workers=0):
valid_loader = DataLoader(
valid_dataset,
batch_size=BATCH_SIZE,
shuffle=False,
num_workers=num_workers,
collate_fn=collate_fn,
drop_last=True,
)
return valid_loader
# ---------------------------------------------------------
# Debug/demo if run directly
# ---------------------------------------------------------
if __name__ == "__main__":
# Example usage with no transforms for debugging
dataset = CustomDataset(dir_path=TRAIN_DIR, width=RESIZE_TO, height=RESIZE_TO, classes=CLASSES, transforms=None)
print(f"Number of training images: {len(dataset)}")
def visualize_sample(image, target):
"""
Visualize a single sample using OpenCV. Expects
`image` as a NumPy array of shape (H, W, 3) in [0..1].
"""
# Convert [0,1] float -> [0,255] uint8
img = (image * 255).astype(np.uint8)
# Convert RGB -> BGR
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
boxes = target["boxes"].cpu().numpy().astype(np.int32)
labels = target["labels"].cpu().numpy().astype(np.int32)
for i, box in enumerate(boxes):
x1, y1, x2, y2 = box
class_idx = labels[i]
# If your class_idx starts at 1 for "first class", ensure you handle that:
# e.g. if CLASSES = ["background", "class1", "class2", ...]
if 0 <= class_idx < len(CLASSES):
class_str = CLASSES[class_idx]
else:
class_str = f"Label_{class_idx}"
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.putText(img, class_str, (x1, max(y1 - 5, 0)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
cv2_imshow(img)
# cv2.waitKey(0)
# Visualize a few samples
NUM_SAMPLES_TO_VISUALIZE = 10
for i in range(NUM_SAMPLES_TO_VISUALIZE):
image, target = dataset[i] # No transforms in this example
# `image` is shape (H, W, 3) in [0..1]
print(f"Visualizing sample {i}, boxes: {target['boxes'].shape[0]}")
visualize_sample(image, target)
# cv2.destroyAllWindows()
Number of training images: 1279
Visualizing sample 0, boxes: 1
Visualizing sample 1, boxes: 3
Visualizing sample 2, boxes: 2
Visualizing sample 3, boxes: 1
Visualizing sample 4, boxes: 1
Visualizing sample 5, boxes: 1
Visualizing sample 6, boxes: 1
Visualizing sample 7, boxes: 1
Visualizing sample 8, boxes: 1
Visualizing sample 9, boxes: 1
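To make the annotation format concrete, the short cell below pushes one made-up label line through the same conversion CustomDataset.__getitem__ performs: the normalized corner coordinates are scaled to the resized image, and the class id is shifted by one so that index 0 stays reserved for the background class. The label line itself is hypothetical, not taken from the dataset.
In [ ]:
# Hypothetical label line: class_id x_min y_min x_max y_max, all normalized to [0, 1].
line = "1 0.10 0.25 0.60 0.80"

parts = line.split()
class_id = int(parts[0])
xmin, ymin, xmax, ymax = map(float, parts[1:])

# Same conversion as in CustomDataset.__getitem__, at RESIZE_TO = 640.
label_idx = class_id + 1  # shift by one; 0 stays reserved for background
box = [xmin * RESIZE_TO, ymin * RESIZE_TO, xmax * RESIZE_TO, ymax * RESIZE_TO]
print("class:", CLASSES[label_idx], "| box (pixels):", box)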
model.py¶
In [ ]:
import torchvision
import torch
from functools import partial
from torchvision.models.detection import RetinaNet_ResNet50_FPN_V2_Weights
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
#from config import NUM_CLASSES
def create_model(num_classes=91):
"""
Creates a RetinaNet-ResNet50-FPN v2 model pre-trained on COCO.
Replaces the classification head for the required number of classes.
"""
model = torchvision.models.detection.retinanet_resnet50_fpn_v2(weights=RetinaNet_ResNet50_FPN_V2_Weights.COCO_V1)
num_anchors = model.head.classification_head.num_anchors
# Replace the classification head
model.head.classification_head = RetinaNetClassificationHead(
in_channels=256, num_anchors=num_anchors, num_classes=num_classes, norm_layer=partial(torch.nn.GroupNorm, 32)
)
return model
if __name__ == "__main__":
model = create_model(num_classes=NUM_CLASSES)
print(model)
# Total parameters:
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
# Trainable parameters:
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
RetinaNet( (backbone): BackboneWithFPN( (body): IntermediateLayerGetter( (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False) (layer1): Sequential( (0): Bottleneck( (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (downsample): Sequential( (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Bottleneck( (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (2): Bottleneck( (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) ) (layer2): Sequential( (0): Bottleneck( (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (downsample): Sequential( (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Bottleneck( (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (2): Bottleneck( (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), 
bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (3): Bottleneck( (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) ) (layer3): Sequential( (0): Bottleneck( (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (downsample): Sequential( (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Bottleneck( (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (2): Bottleneck( (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (3): Bottleneck( (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (4): Bottleneck( (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (5): Bottleneck( (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) ) (layer4): Sequential( (0): Bottleneck( (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (downsample): Sequential( (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): Bottleneck( (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) (2): Bottleneck( (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) ) ) ) (fpn): FeaturePyramidNetwork( (inner_blocks): ModuleList( (0): Conv2dNormActivation( (0): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1)) ) (1): Conv2dNormActivation( (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1)) ) (2): Conv2dNormActivation( (0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1)) ) ) (layer_blocks): ModuleList( (0-2): 3 x Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (extra_blocks): LastLevelP6P7( (p6): Conv2d(2048, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) (p7): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (anchor_generator): AnchorGenerator() (head): RetinaNetHead( (classification_head): 
RetinaNetClassificationHead( (conv): Sequential( (0): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (1): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (2): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (3): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) ) (cls_logits): Conv2d(256, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (regression_head): RetinaNetRegressionHead( (conv): Sequential( (0): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (1): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (2): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) (3): Conv2dNormActivation( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (1): GroupNorm(32, 256, eps=1e-05, affine=True) (2): ReLU(inplace=True) ) ) (bbox_reg): Conv2d(256, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (transform): GeneralizedRCNNTransform( Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) Resize(min_size=(800,), max_size=1333, mode='bilinear') ) ) 36,414,865 total parameters. 36,189,521 training parameters.
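One detail worth reading off the summary above: the replaced classification head ends in cls_logits with 45 output channels and the regression head ends in bbox_reg with 36. Those are simply num_anchors × num_classes and num_anchors × 4 for the 9 anchors RetinaNet places at every feature-map location, so the quick arithmetic check below reproduces them without rebuilding the model.
In [ ]:
# Reproduce the head output sizes printed in the model summary above.
num_anchors = 9            # RetinaNet uses 9 anchors (3 scales x 3 aspect ratios) per location
num_classes = NUM_CLASSES  # 5 here: len(CLASSES) from config.py

print("cls_logits channels:", num_anchors * num_classes)  # 45, matches the summary
print("bbox_reg channels:  ", num_anchors * 4)            # 36, matches the summary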
In [ ]:
!pip install torchmetrics -q
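train.py below scores the validation set with torchmetrics' MeanAveragePrecision. Its update() method takes a list of per-image prediction dicts (boxes, scores, labels) and a list of per-image ground-truth dicts (boxes, labels), and compute() returns a dict containing, among others, the "map" and "map_50" values the training loop logs each epoch. A tiny standalone example with made-up boxes:
In [ ]:
import torch
from torchmetrics.detection import MeanAveragePrecision

# One image, one ground-truth box, and one slightly offset prediction (values made up).
preds = [{
    "boxes": torch.tensor([[10.0, 10.0, 100.0, 100.0]]),
    "scores": torch.tensor([0.9]),
    "labels": torch.tensor([1]),
}]
target = [{
    "boxes": torch.tensor([[12.0, 12.0, 98.0, 98.0]]),
    "labels": torch.tensor([1]),
}]

metric = MeanAveragePrecision()
metric.update(preds, target)
summary = metric.compute()
print(summary["map"], summary["map_50"])  # same keys the training loop reads per epoch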
train.py¶
In [ ]:
#from config import (
# DEVICE,
# NUM_CLASSES,
# NUM_EPOCHS,
# OUT_DIR,
# VISUALIZE_TRANSFORMED_IMAGES,
# NUM_WORKERS,
# RESIZE_TO,
# VALID_DIR,
# TRAIN_DIR,
#)
#from model import create_model
#from custom_utils import Averager, SaveBestModel, save_model, save_loss_plot, save_mAP
import torch
import matplotlib.pyplot as plt
import time
import os
from tqdm.auto import tqdm
#from datasets import create_train_dataset, create_valid_dataset, create_train_loader, create_valid_loader
#from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchmetrics.detection import MeanAveragePrecision
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
plt.style.use("ggplot")
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Function for running training iterations.
def train(train_data_loader, model):
print("Training")
model.train()
# initialize tqdm progress bar
prog_bar = tqdm(train_data_loader, total=len(train_data_loader))
for i, data in enumerate(prog_bar):
optimizer.zero_grad()
images, targets = data
images = list(image.to(DEVICE) for image in images)
targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
loss_value = losses.item()
train_loss_hist.send(loss_value)
losses.backward()
optimizer.step()
# update the loss value beside the progress bar for each iteration
prog_bar.set_description(desc=f"Loss: {loss_value:.4f}")
return loss_value
# Function for running validation iterations.
def validate(valid_data_loader, model):
print("Validating")
model.eval()
# Initialize tqdm progress bar.
prog_bar = tqdm(valid_data_loader, total=len(valid_data_loader))
target = []
preds = []
for i, data in enumerate(prog_bar):
images, targets = data
images = list(image.to(DEVICE) for image in images)
targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
with torch.no_grad():
outputs = model(images, targets)
# For mAP calculation using Torchmetrics.
#####################################
for i in range(len(images)):
true_dict = dict()
preds_dict = dict()
true_dict["boxes"] = targets[i]["boxes"].detach().cpu()
true_dict["labels"] = targets[i]["labels"].detach().cpu()
preds_dict["boxes"] = outputs[i]["boxes"].detach().cpu()
preds_dict["scores"] = outputs[i]["scores"].detach().cpu()
preds_dict["labels"] = outputs[i]["labels"].detach().cpu()
preds.append(preds_dict)
target.append(true_dict)
#####################################
metric.reset()
metric.update(preds, target)
metric_summary = metric.compute()
return metric_summary
if __name__ == "__main__":
os.makedirs("outputs", exist_ok=True)
train_dataset = create_train_dataset(TRAIN_DIR)
valid_dataset = create_valid_dataset(VALID_DIR)
train_loader = create_train_loader(train_dataset, NUM_WORKERS)
valid_loader = create_valid_loader(valid_dataset, NUM_WORKERS)
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(valid_dataset)}\n")
# Initialize the model and move to the computation device.
model = create_model(num_classes=NUM_CLASSES)
model = model.to(DEVICE)
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=0.0005)
scheduler = ReduceLROnPlateau(
optimizer,
mode="max", # we want to maximize mAP
factor=0.1, # reduce LR by this factor
        patience=8,  # wait 8 epochs with no improvement before reducing the LR
threshold=0.005, # how much improvement is considered significant
cooldown=1,
)
# To monitor training loss
train_loss_hist = Averager()
# To store training loss and mAP values.
train_loss_list = []
map_50_list = []
map_list = []
    # Name to save the trained model with.
MODEL_NAME = "model"
# Whether to show transformed images from data loader or not.
if VISUALIZE_TRANSFORMED_IMAGES:
        # from custom_utils import show_transformed_image
        show_transformed_image(train_loader)
# To save best model.
save_best_model = SaveBestModel()
metric = MeanAveragePrecision()
metric.warn_on_many_detections = False
# Training loop.
for epoch in range(NUM_EPOCHS):
print(f"\nEPOCH {epoch+1} of {NUM_EPOCHS}")
# Reset the training loss histories for the current epoch.
train_loss_hist.reset()
# Start timer and carry out training and validation.
start = time.time()
train_loss = train(train_loader, model)
metric_summary = validate(valid_loader, model)
current_map_05_95 = float(metric_summary["map"])
current_map_05 = float(metric_summary["map_50"])
print(f"Epoch #{epoch+1} train loss: {train_loss_hist.value:.3f}")
print(f"Epoch #{epoch+1} mAP: {metric_summary['map']:.3f}")
end = time.time()
print(f"Took {((end - start) / 60):.3f} minutes for epoch {epoch+1}")
train_loss_list.append(train_loss)
map_50_list.append(metric_summary["map_50"])
map_list.append(metric_summary["map"])
# save the best model till now.
save_best_model(model, float(metric_summary["map"]), epoch, "outputs")
# Save the current epoch model.
save_model(epoch, model, optimizer)
# Save loss plot.
save_loss_plot(OUT_DIR, train_loss_list)
# Save mAP plot.
save_mAP(OUT_DIR, map_50_list, map_list)
scheduler.step(current_map_05_95)
print("Current LR:", scheduler.get_last_lr())
/usr/local/lib/python3.11/dist-packages/albumentations/core/composition.py:331: UserWarning: Got processor for bboxes, but no transform to process it. self._set_keys()
Number of training samples: 1279
Number of validation samples: 225
RetinaNet(...)  [full model summary omitted; identical to the printout in the model.py section above]
36,414,865 total parameters. 36,189,521 training parameters.
EPOCH 1 of 30
Training
0%| | 0/159 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipython-input-3958621971.py in <cell line: 0>()
    163 # Start timer and carry out training and validation.
    164 start = time.time()
--> 165 train_loss = train(train_loader, model)
/tmp/ipython-input-3958621971.py in train(train_data_loader, model)
     51 targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
---> 52 loss_dict = model(images, targets)
[... torch / torchvision internals elided: backbone forward -> F.conv2d -> DataLoader signal handler ...]
RuntimeError: DataLoader worker (pid 78005) is killed by signal: Killed.
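The run above stops because a DataLoader worker was killed, which on Colab usually indicates the runtime ran out of system RAM or shared memory rather than a bug in the training code itself. A common mitigation, sketched below rather than guaranteed, is to rebuild the loaders with num_workers=0 (and, if needed, a smaller BATCH_SIZE in config.py) and rerun the training loop.
In [ ]:
# Sketch of a lower-memory configuration: load data in the main process only.
# Assumes train_dataset / valid_dataset from the cell above still exist in the session.
train_loader = create_train_loader(train_dataset, num_workers=0)
valid_loader = create_valid_loader(valid_dataset, num_workers=0)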
app.py¶
In [ ]:
import os
import cv2
import time
import torch
import gradio as gr
import numpy as np
# Make sure these are your local imports from your project.
#from model import create_model
#from config import NUM_CLASSES, DEVICE, CLASSES
# ----------------------------------------------------------------
# GLOBAL SETUP
# ----------------------------------------------------------------
# Create the model and load the best weights.
model = create_model(num_classes=NUM_CLASSES)
checkpoint = torch.load("outputs/best_model_79.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(DEVICE).eval()
# Create a colors array for each class index.
# (length matches len(CLASSES), including background if you wish).
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
# COLORS = [
# (255, 255, 0), # Cyan - background
# (50, 0, 255), # Red - buffalo
# (147, 20, 255), # Pink - elephant
# (0, 255, 0), # Green - rhino
# (238, 130, 238), # Violet - zebra
# ]
# ----------------------------------------------------------------
# HELPER FUNCTIONS
# ----------------------------------------------------------------
def inference_on_image(orig_image: np.ndarray, resize_dim=None, threshold=0.25):
"""
Runs inference on a single image (OpenCV BGR or NumPy array).
- resize_dim: if not None, we resize to (resize_dim, resize_dim)
- threshold: detection confidence threshold
    Returns: (image with bounding boxes drawn, FPS of this forward pass).
"""
image = orig_image.copy()
# Optionally resize for inference.
if resize_dim is not None:
image = cv2.resize(image, (resize_dim, resize_dim))
# Convert BGR to RGB, normalize [0..1]
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
# Move channels to front (C,H,W)
image_tensor = torch.tensor(image_rgb.transpose(2, 0, 1), dtype=torch.float).unsqueeze(0).to(DEVICE)
start_time = time.time()
# Inference
with torch.no_grad():
outputs = model(image_tensor)
end_time = time.time()
# Get the current fps.
fps = 1 / (end_time - start_time)
fps_text = f"FPS: {fps:.2f}"
# Move outputs to CPU numpy
outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]
boxes = outputs[0]["boxes"].numpy()
scores = outputs[0]["scores"].numpy()
labels = outputs[0]["labels"].numpy().astype(int)
# Filter out boxes with low confidence
valid_idx = np.where(scores >= threshold)[0]
boxes = boxes[valid_idx].astype(int)
labels = labels[valid_idx]
    # Original image size (needed both for rescaling boxes and for placing the FPS text below).
    h_orig, w_orig = orig_image.shape[:2]
    # If we resized for inference, rescale boxes back to orig_image size
    if resize_dim is not None:
        h_new, w_new = resize_dim, resize_dim
        # scale boxes
        boxes[:, [0, 2]] = (boxes[:, [0, 2]] / w_new) * w_orig
        boxes[:, [1, 3]] = (boxes[:, [1, 3]] / h_new) * h_orig
# Draw bounding boxes
for box, label_idx in zip(boxes, labels):
class_name = CLASSES[label_idx] if 0 <= label_idx < len(CLASSES) else str(label_idx)
color = COLORS[label_idx % len(COLORS)][::-1] # BGR color
cv2.rectangle(orig_image, (box[0], box[1]), (box[2], box[3]), color, 5)
cv2.putText(orig_image, class_name, (box[0], box[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3)
cv2.putText(
orig_image,
fps_text,
(int((w_orig / 2) - 50), 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(0, 255, 0),
2,
cv2.LINE_AA,
)
return orig_image, fps
def inference_on_frame(frame: np.ndarray, resize_dim=None, threshold=0.25):
"""
Same as inference_on_image but for a single video frame.
Returns the processed frame with bounding boxes.
"""
return inference_on_image(frame, resize_dim, threshold)
# ----------------------------------------------------------------
# GRADIO FUNCTIONS
# ----------------------------------------------------------------
def img_inf(image_path, resize_dim, threshold):
"""
Gradio function for image inference.
:param image_path: File path from Gradio (uploaded image).
    :param resize_dim: Inference resize dimension from the Gradio slider.
    :param threshold: Confidence threshold from the Gradio slider.
Returns: A NumPy image array with bounding boxes.
"""
if image_path is None:
return None # No image provided
orig_image = cv2.imread(image_path) # BGR
if orig_image is None:
return None # Error reading image
result_image, _ = inference_on_image(orig_image, resize_dim=resize_dim, threshold=threshold)
# Return the image in RGB for Gradio's display
result_image_rgb = cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB)
return result_image_rgb
def vid_inf(video_path, resize_dim, threshold):
"""
Gradio function for video inference.
Processes each frame, draws bounding boxes, and writes to an output video.
Returns: (last_processed_frame, output_video_file_path)
"""
if video_path is None:
return None, None # No video provided
# Prepare input capture
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
return None, None
# Create an output file path
os.makedirs("inference_outputs/videos", exist_ok=True)
out_video_path = os.path.join("inference_outputs/videos", "video_output.mp4")
# out_video_path = "video_output.mp4"
# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*"mp4v") # or 'XVID'
    # If FPS is reported as 0 (some containers do this), fall back to a default.
if fps <= 0:
fps = 20.0
out_writer = cv2.VideoWriter(out_video_path, fourcc, fps, (width, height))
frame_count = 0
total_fps = 0
while True:
ret, frame = cap.read()
if not ret:
break
# Inference on frame
processed_frame, frame_fps = inference_on_frame(frame, resize_dim=resize_dim, threshold=threshold)
total_fps += frame_fps
frame_count += 1
# Write the processed frame
out_writer.write(processed_frame)
yield cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB), None
    avg_fps = total_fps / frame_count if frame_count else 0.0
cap.release()
out_writer.release()
print(f"Average FPS: {avg_fps:.3f}")
yield None, out_video_path
# ----------------------------------------------------------------
# BUILD THE GRADIO INTERFACES
# ----------------------------------------------------------------
# Gradio input widgets: sliders for the inference resize dimension and the
# detection confidence threshold (defined separately for the image and video tabs).
resize_dim = gr.Slider(100, 1024, value=640, label="Resize Dimension", info="Resize image to this dimension")
threshold = gr.Slider(0, 1, value=0.5, label="Threshold", info="Confidence threshold for detection")
inputs_image = gr.Image(type="filepath", label="Input Image")
outputs_image = gr.Image(type="numpy", label="Output Image")
interface_image = gr.Interface(
fn=img_inf,
inputs=[inputs_image, resize_dim, threshold],
outputs=outputs_image,
title="Image Inference",
description="Upload your photo, select a model, and see the results!",
examples=[["examples/buffalo.jpg"], ["examples/zebra.jpg"]],
cache_examples=False,
)
resize_dim = gr.Slider(100, 1024, value=640, label="Resize Dimension", info="Resize image to this dimension")
threshold = gr.Slider(0, 1, value=0.5, label="Threshold", info="Confidence threshold for detection")
input_video = gr.Video(label="Input Video")
# Output is a pair: (last_processed_frame, output_video_path)
output_frame = gr.Image(type="numpy", label="Output (Last Processed Frame)")
output_video_file = gr.Video(format="mp4", label="Output Video")
interface_video = gr.Interface(
fn=vid_inf,
inputs=[input_video, resize_dim, threshold],
outputs=[output_frame, output_video_file],
title="Video Inference",
description="Upload your video and see the processed output!",
examples=[["examples/elephants.mp4"], ["examples/rhino.mp4"]],
cache_examples=False,
)
# Combine them in a Tabbed Interface
demo = (
gr.TabbedInterface(
[interface_image, interface_video],
tab_names=["Image", "Video"],
title="FineTuning RetinaNet for Wildlife Animal Detection",
theme="gstaff/xkcd",
)
.queue()
.launch()
)
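The same helpers can also be used without the Gradio UI, for example to spot-check the model on a single image from a notebook cell. A minimal sketch, assuming a local image at examples/buffalo.jpg (one of the example paths referenced above):
In [ ]:
import os
import cv2

# Run the detector on one image directly, bypassing the Gradio front end.
os.makedirs("inference_outputs", exist_ok=True)
orig = cv2.imread("examples/buffalo.jpg")  # assumed to exist; swap in any local image path
if orig is not None:
    result, fps = inference_on_image(orig, resize_dim=640, threshold=0.5)
    cv2.imwrite("inference_outputs/buffalo_pred.jpg", result)
    print(f"Saved prediction (ran at {fps:.2f} FPS)")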