YOLOv3学习：（二）网络结构推导与实现

网络结构图简版：

网络结构图简版+特征图的大小变换：

网络结构-详细版

网络结构模块化

网络结构图展开(超详细版)

网络结构+示例-3D版（利用多尺度特征进行对象检测）

9种尺度的先验框

随着输出的特征图的数量和尺度的变化，先验框的尺寸也需要相应的调整。YOLO2已经开始采用K-means聚类得到先验框的尺寸，YOLO3延续了这种方法，为每种下采样尺度设定3种先验框，总共聚类出9种尺寸的先验框。在COCO数据集上，这9个先验框是：(10x13)，(16x30)，(33x23)，(30x61)，(62x45)，(59x119)，(116x90)，(156x198)，(373x326)。

分配上，在最小的13×13特征图上（有最大的感受野）应用较大的先验框(116x90)，(156x198)，(373x326)，适合检测较大的对象。中等的26×26特征图上（中等感受野）应用中等的先验框(30x61)，(62x45)，(59x119)，适合检测中等大小的对象。较大的52×52特征图上（较小的感受野）应用较小的先验框(10x13)，(16x30)，(33x23)，适合检测较小的对象。

感受一下9种先验框的尺寸，下图中蓝色框为聚类得到的先验框，黄色框是ground truth，红框是对象中心点所在的网格。

输入到输出的映射(包含输出参数的解释)

不考虑神经网络结构细节的话，总的来说，对于一个输入图像，YOLO3将其映射到3个尺度的输出张量，代表图像各个位置存在各种对象的概率。

我们看一下YOLO3共进行了多少个预测。对于一个416×416的输入图像，在每个尺度的特征图的每个网格设置3个先验框，总共有 13×13×3 + 26×26×3 + 52×52×3 = 10647 个预测。每一个预测是一个(4+1+80)=85维向量，这个85维向量包含边框坐标（4个数值），边框置信度（1个数值），对象类别的概率（对于COCO数据集，有80种对象）。

对比一下，YOLO2采用13×13×5 = 845个预测，YOLO3尝试预测的边框数量增加了10多倍，而且是在不同分辨率上进行，所以mAP以及对小物体的检测效果有一定的提升。

代码实现

代码

import torch
import torch.nn as nn

# Basic Darknet-53 building block: Conv2d -> BatchNorm2d -> LeakyReLU.
class ConvBNReLU(nn.Module):
    """Convolution followed by batch normalisation and a LeakyReLU activation.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: zero padding applied on each side of the input.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(ConvBNReLU, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.BN = nn.BatchNorm2d(out_channels)
        # Darknet uses a leaky ReLU with negative slope 0.1. ReLU6 (used
        # previously) zeroes every negative activation and clamps at 6, which
        # contradicts both this attribute's name and the block's description.
        self.leaky_relu = nn.LeakyReLU(0.1, inplace=True)

    def forward(self, x):
        """Apply conv, batch norm and the activation, in that order."""
        return self.leaky_relu(self.BN(self.conv(x)))

# Basic Darknet-53 building block: downsampling done by a strided convolution.
class DownSample(nn.Module):
    """Reduce spatial resolution with a single 3x3 convolution (stride 2, padding 1).

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels of the downsampled feature map.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.down_samp = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=2,
            padding=1,
        )

    def forward(self, x):
        """Return the strided convolution of ``x``; even spatial sizes are halved."""
        return self.down_samp(x)

# Basic Darknet-53 building block: residual unit.
class ResBlock(nn.Module):
    """Residual unit: a 1x1 conv halves the channels, a 3x3 conv restores them.

    The input is added back to the result, so the output has the same shape
    as the input.

    Args:
        nchannels: number of input (and output) channels.
    """

    def __init__(self, nchannels):
        super().__init__()
        bottleneck = nchannels // 2
        self.conv1x1 = ConvBNReLU(nchannels, bottleneck, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = ConvBNReLU(bottleneck, nchannels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Return ``conv3x3(conv1x1(x)) + x``."""
        squeezed = self.conv1x1(x)
        restored = self.conv3x3(squeezed)
        return restored + x
    
# YOLOv3 backbone: Darknet-53 without its classification head.
class DarkNet53_YOLOv3(nn.Module):
    """Darknet-53 feature extractor returning the three maps YOLOv3 detects on.

    The network is a stem convolution followed by five stages. Each stage is
    a stride-2 downsampling convolution followed by a stack of residual
    units. Stage depths follow Darknet-53: 1, 2, 8, 8 and 4 units.

    Every residual unit is a separate module with its own weights. Calling
    one ``ResBlock`` instance several times in a row would share a single set
    of weights across the whole stage.
    """

    def __init__(self):
        super(DarkNet53_YOLOv3, self).__init__()
        self.conv_bn_relu = ConvBNReLU(3, 32, 3, 1, 1)
        self.down_samp_0 = DownSample(32, 64)
        self.res_block_1 = self._make_stage(64, 1)
        self.down_samp_1 = DownSample(64, 128)
        self.res_block_2 = self._make_stage(128, 2)
        self.down_samp_2 = DownSample(128, 256)
        self.res_block_3 = self._make_stage(256, 8)
        self.down_samp_3 = DownSample(256, 512)
        self.res_block_4 = self._make_stage(512, 8)
        self.down_samp_4 = DownSample(512, 1024)
        self.res_block_5 = self._make_stage(1024, 4)

    @staticmethod
    def _make_stage(nchannels, depth):
        """Stack ``depth`` independent residual units of width ``nchannels``."""
        return nn.Sequential(*(ResBlock(nchannels) for _ in range(depth)))

    def forward(self, x):
        """Run the backbone.

        Args:
            x: image batch with 3 channels.

        Returns:
            Tuple ``(out1, out2, out3)`` with 256, 512 and 1024 channels,
            taken after the 3rd, 4th and 5th downsampling stage respectively
            (52x52, 26x26 and 13x13 for a 416x416 input).
        """
        x = self.conv_bn_relu(x)
        x = self.res_block_1(self.down_samp_0(x))
        x = self.res_block_2(self.down_samp_1(x))
        out1 = self.res_block_3(self.down_samp_2(x))
        out2 = self.res_block_4(self.down_samp_3(out1))
        out3 = self.res_block_5(self.down_samp_4(out2))
        return out1, out2, out3

# YOLOv3 13x13 output branch: the layers that follow Darknet-53.
# input_shape = (1024, 13, 13), out_shape = (255, 13, 13), out_branch_shape = (512, 13, 13)
class Out1LastLayers(nn.Module):
    """Detection head for the coarsest (13x13) scale.

    Five alternating 1x1/3x3 convolutions produce the 512-channel feature
    that is also handed to the next finer scale. A further 3x3 convolution
    and a 1x1 prediction convolution produce the 255-channel output
    (3 anchors x (4 box values + 1 confidence + 80 classes)).

    Each convolution is its own module, so no weights are shared between
    layers. The prediction layer is a plain ``nn.Conv2d`` without batch norm
    or activation, so the raw predictions are not bounded by an activation.
    """

    def __init__(self):
        super(Out1LastLayers, self).__init__()
        self.conv_set = nn.Sequential(
            ConvBNReLU(1024, 512, 1, 1, 0),
            ConvBNReLU(512, 1024, 3, 1, 1),
            ConvBNReLU(1024, 512, 1, 1, 0),
            ConvBNReLU(512, 1024, 3, 1, 1),
            ConvBNReLU(1024, 512, 1, 1, 0),
        )
        self.conv3x3 = ConvBNReLU(512, 1024, 3, 1, 1)
        self.conv1x1_last = nn.Conv2d(1024, 255, 1, 1, 0)

    def forward(self, x):
        """Return ``(out, out_branch)``.

        Args:
            x: 1024-channel backbone feature map.

        Returns:
            ``out``: 255-channel raw predictions for this scale.
            ``out_branch``: 512-channel feature passed on to the 26x26 head.
        """
        out_branch = self.conv_set(x)
        out = self.conv1x1_last(self.conv3x3(out_branch))
        return out, out_branch

# YOLOv3 26x26 output branch: the layers that follow Darknet-53.
# input_shape = (512, 26, 26), out_shape = (255, 26, 26), out_branch_shape = (256, 26, 26)
class Out2LastLayers(nn.Module):
    """Detection head for the middle (26x26) scale.

    The 512-channel feature from the coarser head is reduced to 256 channels,
    upsampled by 2 and concatenated with the 512-channel backbone feature
    (768 channels in total). Five alternating 1x1/3x3 convolutions then
    produce the 256-channel feature that is also handed to the next finer
    scale. A further 3x3 convolution and a 1x1 prediction convolution produce
    the 255-channel output.

    Each convolution is its own module, so no weights are shared between
    layers. The prediction layer is a plain ``nn.Conv2d`` without batch norm
    or activation, so the raw predictions are not bounded by an activation.
    """

    def __init__(self):
        super(Out2LastLayers, self).__init__()
        # Lateral path: channel reduction of the coarser feature, then upsample.
        self.conv1x1 = ConvBNReLU(512, 256, 1, 1, 0)
        self.up_sample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv_set = nn.Sequential(
            ConvBNReLU(768, 256, 1, 1, 0),
            ConvBNReLU(256, 512, 3, 1, 1),
            ConvBNReLU(512, 256, 1, 1, 0),
            ConvBNReLU(256, 512, 3, 1, 1),
            ConvBNReLU(512, 256, 1, 1, 0),
        )
        self.conv3x3 = ConvBNReLU(256, 512, 3, 1, 1)
        self.conv1x1_last = nn.Conv2d(512, 255, 1, 1, 0)

    def forward(self, x, x_branch):
        """Return ``(out, out_branch)``.

        Args:
            x: 512-channel backbone feature map.
            x_branch: 512-channel feature from the 13x13 head, at half the
                spatial size of ``x``.

        Returns:
            ``out``: 255-channel raw predictions for this scale.
            ``out_branch``: 256-channel feature passed on to the 52x52 head.
        """
        lateral = self.up_sample(self.conv1x1(x_branch))
        merged = torch.cat([x, lateral], 1)
        out_branch = self.conv_set(merged)
        out = self.conv1x1_last(self.conv3x3(out_branch))
        return out, out_branch

# YOLOv3 52x52 output branch: the layers that follow Darknet-53.
# input_shape = (256, 52, 52), out_shape = (255, 52, 52)
class Out3LastLayers(nn.Module):
    """Detection head for the finest (52x52) scale.

    The 256-channel feature from the coarser head is reduced to 128 channels,
    upsampled by 2 and concatenated with the 256-channel backbone feature
    (384 channels in total). Six alternating 1x1/3x3 convolutions and a 1x1
    prediction convolution then produce the 255-channel output.

    Each convolution is its own module, so no weights are shared between
    layers. The prediction layer is a plain ``nn.Conv2d`` without batch norm
    or activation, so the raw predictions are not bounded by an activation.
    """

    def __init__(self):
        super(Out3LastLayers, self).__init__()
        # Lateral path: channel reduction of the coarser feature, then upsample.
        self.conv1x1 = ConvBNReLU(256, 128, 1, 1, 0)
        self.up_sample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv_set = nn.Sequential(
            ConvBNReLU(384, 128, 1, 1, 0),
            ConvBNReLU(128, 256, 3, 1, 1),
            ConvBNReLU(256, 128, 1, 1, 0),
            ConvBNReLU(128, 256, 3, 1, 1),
            ConvBNReLU(256, 128, 1, 1, 0),
            ConvBNReLU(128, 256, 3, 1, 1),
        )
        self.conv1x1_last = nn.Conv2d(256, 255, 1, 1, 0)

    def forward(self, x, x_branch):
        """Return the 255-channel raw predictions for this scale.

        Args:
            x: 256-channel backbone feature map.
            x_branch: 256-channel feature from the 26x26 head, at half the
                spatial size of ``x``.
        """
        lateral = self.up_sample(self.conv1x1(x_branch))
        merged = torch.cat([x, lateral], 1)
        return self.conv1x1_last(self.conv_set(merged))

# Full YOLOv3 model: Darknet-53 backbone plus three detection heads.
class YOLOv3(nn.Module):
    """YOLOv3 network producing predictions at three scales."""

    def __init__(self):
        super().__init__()
        self.darknet53 = DarkNet53_YOLOv3()
        self.out1_last_layers = Out1LastLayers()
        self.out2_last_layers = Out2LastLayers()
        self.out3_last_layers = Out3LastLayers()

    def forward(self, x):
        """Return the predictions ordered from coarsest to finest scale.

        For a 416x416 input the backbone yields feature maps of shape
        (256, 52, 52), (512, 26, 26) and (1024, 13, 13), in that order.
        Heads run from coarse to fine because each finer head consumes the
        branch feature produced by the coarser one.
        """
        feat_fine, feat_mid, feat_coarse = self.darknet53(x)
        pred_coarse, branch_coarse = self.out1_last_layers(feat_coarse)
        pred_mid, branch_mid = self.out2_last_layers(feat_mid, branch_coarse)
        pred_fine = self.out3_last_layers(feat_fine, branch_mid)
        return pred_coarse, pred_mid, pred_fine

输入输出测试

# Smoke test: push an all-zero 416x416 image through the model and print the
# input shape followed by the three output shapes.
fake_input = torch.zeros(1, 3, 416, 416)
print(fake_input.shape)
model = YOLOv3()
out1, out2, out3 = model(fake_input)
print(*(t.shape for t in (out1, out2, out3)))

torch.Size([1, 3, 416, 416])
torch.Size([1, 255, 13, 13]) torch.Size([1, 255, 26, 26]) torch.Size([1, 255, 52, 52])

参考资料

YOLOv3网络结构和解析：https://blog.csdn.net/dz4543/article/details/90049377
Darknet53网络各层参数详解：https://blog.csdn.net/qq_40210586/article/details/106144197
目标检测0-02：YOLO V3-网络结构输入输出解析：https://blog.csdn.net/weixin_43013761/article/details/98349080
YOLOv3 深入理解：https://www.jianshu.com/p/d13ae1055302

YOLOv3学习：（二）网络结构推导与实现

YOLOv3学习：（二）网络结构推导与实现

网络结构图简版：

网络结构图简版+特征图的大小变换：

网络结构-详细版

网络结构模块化

网络结构图展开(超详细版)

网络结构+示例-3D版（利用多尺度特征进行对象检测）

9种尺度的先验框

输入到输出的映射(包含输出参数的解释)

代码实现

代码

输入输出测试

参考资料

评论 (0)