1、简介

论文原文: [VggNet原文](https://arxiv.org/pdf/1409.1556.pdf)

VGG是由牛津大学视觉几何小组(Visual Geometry Group, VGG)提出的一种深层卷积网络, 该网络在2014年ILSVRC竞赛中获得定位任务的第一名, 分类任务的第二名。VGG可以看成是加深版的AlexNet, 都由conv layer + FC layer组成。

下图是VGG16模型的结构简图

vgg16

网络的亮点

  • 通过堆叠多个3×3的卷积代替大尺度卷积核 (在保证相同感受野的前提下能够减少所需的参数量)

论文中提到,通过堆叠两个3×3的卷积核代替5×5的卷积核, 堆叠三个3×3的卷积核代替7×7的卷积核。It is easy to see that a stack of two 3 × 3 conv layers (without spatial pooling in between) has an effective receptive field of 5 × 5; three such layers have a 7 × 7 effective receptive field.

下面给出一个实例

使用7×7卷积核所需参数, 假设输入输出通道数均为C。

7×7×C×C = 49C²

堆叠3个3×3卷积核所需参数, 假设输入输出通道数均为C。

3×3×C×C + 3×3×C×C + 3×3×C×C = 27C²

经过对比发现使用3层3×3的卷积层比使用7×7的卷积核参数更少。

下图是从原论文中截取的几种VGG模型的配置表, 表中作者呈现了几种不同深度的配置(11层, 13层, 16层, 19层)是否使用LRN以及1×1卷积层与3×3卷积层的差异。

image

2、代码实现

1、pytorch实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# @File : model_vgg.py
# @Time :
# @Author :
# @version :python 3.9
# @Software : PyCharm
# @Description:
"""
# ================【功能:】====================
import torch
import torch.nn as nn
import torch.nn.functional as F


# VGG16 (configuration "D" in the paper: 13 conv + 3 FC layers).
# NOTE(review): the original comment said "vgg19", but 2+2+3+3+3 = 13 conv
# layers corresponds to VGG16; VGG19 would have 2+2+4+4+4 = 16 conv layers.
class VggNet(nn.Module):
    """VGG16 backbone + 3-layer classifier head for 5-class classification.

    Expects input of shape (N, 3, 224, 224); returns raw logits of shape (N, 5).
    """

    def __init__(self):
        super(VggNet, self).__init__()
        # Block 1: (224 - 3 + 2*1)/1 + 1 = 224 -> (224, 224, 64)
        self.conv1_64_1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1, stride=1)
        # (224 - 3 + 2*1)/1 + 1 = 224 -> (224, 224, 64)
        self.conv1_64_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, stride=1)
        # (224, 224, 64) -> (112, 112, 64)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 2: (112 - 3 + 2*1)/1 + 1 = 112, so (112, 112, 64) -> (112, 112, 128)
        self.conv2_128_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv2_128_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        # (112, 112, 128) -> (56, 56, 128)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 3: (56 - 3 + 2*1)/1 + 1 = 56, so (56, 56, 128) -> (56, 56, 256)
        self.conv3_256_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_256_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_256_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        # (56, 56, 256) -> (28, 28, 256)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 4: (28, 28, 256) -> (28, 28, 512)
        self.conv4_512_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_512_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_512_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        # (28, 28, 512) -> (14, 14, 512)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 5: (14, 14, 512) -> (14, 14, 512)
        self.conv5_512_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_512_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_512_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        # (14, 14, 512) -> (7, 7, 512)
        self.maxpool5 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(7 * 7 * 512, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        # 5 output classes (e.g. the flower-photos tutorial dataset).
        self.fc3 = nn.Linear(in_features=4096, out_features=5)

    def forward(self, x):
        """Run the network. x: (N, 3, 224, 224) float tensor -> (N, 5) logits."""
        # BUG FIX: the original applied no non-linearity after the conv
        # layers; stacked convs without ReLU collapse into one linear map.
        # VGG applies ReLU after every conv layer (Simonyan & Zisserman, 2014).
        x = F.relu(self.conv1_64_1(x))
        x = F.relu(self.conv1_64_2(x))
        x = self.maxpool1(x)

        x = F.relu(self.conv2_128_1(x))
        x = F.relu(self.conv2_128_2(x))
        x = self.maxpool2(x)

        x = F.relu(self.conv3_256_1(x))
        x = F.relu(self.conv3_256_2(x))
        x = F.relu(self.conv3_256_3(x))
        x = self.maxpool3(x)

        x = F.relu(self.conv4_512_1(x))
        x = F.relu(self.conv4_512_2(x))
        x = F.relu(self.conv4_512_3(x))
        x = self.maxpool4(x)

        x = F.relu(self.conv5_512_1(x))
        x = F.relu(self.conv5_512_2(x))
        x = F.relu(self.conv5_512_3(x))
        x = self.maxpool5(x)

        # Flatten (N, 512, 7, 7) -> (N, 7*7*512); keeps the batch dimension
        # intact even for non-standard batch sizes (safer than view(-1, ...)).
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# input = torch.rand([8, 3, 224, 224])
# vggnet = VggNet()
# print(vggnet)
# output = vggnet(input)
# print(output)
2、TensorFlow实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# @File : model_vggnet.py
# @Time :
# @Author :0399
# @version :python 3.9
# @Software : PyCharm
# @Description:
"""
# ================【功能:】====================
import tensorflow as tf
from tensorflow.keras import layers, Model, Sequential

# He-style (VarianceScaling, scale=2, fan_out, truncated normal) initializer
# config for the conv kernels — suited to ReLU activations.
CONV_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 2.0,
        'mode': 'fan_out',
        'distribution': 'truncated_normal',
    },
}

# Uniform VarianceScaling (scale=1/3, fan_out) initializer config for the
# fully-connected layers.
DENSE_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 1. / 3.,
        'mode': 'fan_out',
        'distribution': 'uniform',
    },
}


def VGG(feature, im_height=224, im_width=224, num_classes=1000):
    """Attach the VGG classifier head to a convolutional backbone.

    Args:
        feature: callable backbone (e.g. the Sequential from make_feature)
            mapping the input image tensor to the final feature map.
        im_height: input image height (TensorFlow tensors are NHWC).
        im_width: input image width.
        num_classes: number of output classes.

    Returns:
        A tf.keras Model mapping (N, H, W, 3) images to softmax probabilities.
    """
    input_image = layers.Input(shape=(im_height, im_width, 3), dtype="float32")
    x = feature(input_image)
    x = layers.Flatten()(x)
    x = layers.Dropout(rate=0.5)(x)
    x = layers.Dense(2048, activation='relu',
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    x = layers.Dropout(rate=0.5)(x)
    x = layers.Dense(2048, activation='relu',
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    x = layers.Dropout(rate=0.5)(x)
    # BUG FIX: the logit layer must be linear. The original used
    # activation='relu' here, which clamps every negative logit to 0 before
    # the softmax, distorting the probabilities and crippling training.
    x = layers.Dense(num_classes, activation=None,
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    output = layers.Softmax()(x)

    model = Model(inputs=input_image, outputs=output)
    return model


def make_feature(cfg):
    """Build the convolutional backbone described by *cfg*.

    Each integer entry becomes a 3x3 same-padded ReLU conv with that many
    filters; the string 'M' becomes a 2x2 stride-2 max-pooling layer.
    Returns a Sequential named "feature".
    """
    ops = []
    for entry in cfg:
        if entry == 'M':
            ops.append(layers.MaxPool2D(pool_size=2, strides=2))
            continue
        ops.append(layers.Conv2D(entry, kernel_size=3, padding="same",
                                 activation="relu",
                                 kernel_initializer=CONV_KERNEL_INITIALIZER))
    return Sequential(ops, name="feature")


# Layer configurations from Table 1 of the VGG paper: each integer is the
# output-channel count of a 3x3 conv layer, 'M' marks a 2x2 max-pool.
cfgs = {
'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


def vgg(model_name="vgg16", im_height=224, im_width=224, num_classes=1000):
    """Assemble a full VGG model by name ('vgg11'/'vgg13'/'vgg16'/'vgg19').

    Raises KeyError if *model_name* is not a key of cfgs.
    """
    backbone = make_feature(cfgs[model_name])
    return VGG(backbone, im_height=im_height, im_width=im_width,
               num_classes=num_classes)


# input = tf.random.uniform((4, 224, 224, 3))
# vggnet = vgg("vgg16", num_classes=5)
# print(vggnet.summary())
# print(vggnet(input))