1、简介

论文原文: [VggNet原文](https://arxiv.org/pdf/1409.1556.pdf)

VGG是由牛津大学视觉几何小组(Visual Geometry Group, VGG)提出的一种深层卷积网络, 该网络在2014年ILSVRC竞赛中获得定位任务的第一名, 分类任务的第二名。VGG可以看成是加深版的AlexNet, 都由conv layer + FC layer组成。

下图是VGG16模型的结构简图

vgg16

网络的亮点

  • 通过堆叠多个3×3的卷积代替大尺度卷积核 (在保证相同感受野的前提下能够减少所需的参数量)

论文中提到,通过堆叠两个3×3的卷积核代替5×5的卷积核, 堆叠三个3×3的卷积核代替7×7的卷积核。It is easy to see that a stack of two 3 × 3 conv layers (without spatial pooling in between) has an effective receptive field of 5 × 5; three such layers have a 7 × 7 effective receptive field.

下面给出一个实例

使用7×7卷积核所需参数, 假设输入输出通道数均为C。

7×7×C×C = 49C²

堆叠3个3×3卷积核所需参数, 假设输入输出通道数均为C。

3×3×C×C + 3×3×C×C + 3×3×C×C = 27C²

经过对比发现使用3层3×3的卷积层比使用7×7的卷积核参数更少。

下图是从原论文中截取的几种VGG模型的配置表, 表中作者呈现了几种不同深度的配置(11层, 13层, 16层, 19层)是否使用LRN以及1×1卷积层与3×3卷积层的差异。

image

2、代码实现

1、pytorch实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# @File : model_vgg.py
# @Time :
# @Author :
# @version :python 3.9
# @Software : PyCharm
# @Description:
"""
# ================【功能:】====================
import torch
import torch.nn as nn
import torch.nn.functional as F


# VGG16 (configuration "D" in the paper: 13 conv + 3 FC layers).
# NOTE(review): the original comment said "vgg19", but 2+2+3+3+3 = 13 conv
# layers corresponds to VGG16; VGG19 would have 2+2+4+4+4 = 16 conv layers.
class VggNet(nn.Module):
    """VGG16 backbone + 3-layer classifier head for 5-class classification.

    Expects input of shape (N, 3, 224, 224); returns raw logits of shape (N, 5).
    """

    def __init__(self):
        super(VggNet, self).__init__()
        # Block 1: (224 - 3 + 2*1)/1 + 1 = 224 -> (224, 224, 64)
        self.conv1_64_1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1, stride=1)
        # (224 - 3 + 2*1)/1 + 1 = 224 -> (224, 224, 64)
        self.conv1_64_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, stride=1)
        # (224, 224, 64) -> (112, 112, 64)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 2: (112 - 3 + 2*1)/1 + 1 = 112, so (112, 112, 64) -> (112, 112, 128)
        self.conv2_128_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv2_128_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        # (112, 112, 128) -> (56, 56, 128)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 3: (56 - 3 + 2*1)/1 + 1 = 56, so (56, 56, 128) -> (56, 56, 256)
        self.conv3_256_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_256_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv3_256_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        # (56, 56, 256) -> (28, 28, 256)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 4: (28, 28, 256) -> (28, 28, 512)
        self.conv4_512_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_512_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv4_512_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        # (28, 28, 512) -> (14, 14, 512)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Block 5: (14, 14, 512) -> (14, 14, 512)
        self.conv5_512_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_512_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5_512_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        # (14, 14, 512) -> (7, 7, 512)
        self.maxpool5 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(7 * 7 * 512, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        # 5 output classes (e.g. the flower-photos tutorial dataset).
        self.fc3 = nn.Linear(in_features=4096, out_features=5)

    def forward(self, x):
        """Run the network. x: (N, 3, 224, 224) float tensor -> (N, 5) logits."""
        # BUG FIX: the original applied no non-linearity after the conv
        # layers; stacked convs without ReLU collapse into one linear map.
        # VGG applies ReLU after every conv layer (Simonyan & Zisserman, 2014).
        x = F.relu(self.conv1_64_1(x))
        x = F.relu(self.conv1_64_2(x))
        x = self.maxpool1(x)

        x = F.relu(self.conv2_128_1(x))
        x = F.relu(self.conv2_128_2(x))
        x = self.maxpool2(x)

        x = F.relu(self.conv3_256_1(x))
        x = F.relu(self.conv3_256_2(x))
        x = F.relu(self.conv3_256_3(x))
        x = self.maxpool3(x)

        x = F.relu(self.conv4_512_1(x))
        x = F.relu(self.conv4_512_2(x))
        x = F.relu(self.conv4_512_3(x))
        x = self.maxpool4(x)

        x = F.relu(self.conv5_512_1(x))
        x = F.relu(self.conv5_512_2(x))
        x = F.relu(self.conv5_512_3(x))
        x = self.maxpool5(x)

        # Flatten (N, 512, 7, 7) -> (N, 7*7*512); keeps the batch dimension
        # intact even for non-standard batch sizes (safer than view(-1, ...)).
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# input = torch.rand([8, 3, 224, 224])
# vggnet = VggNet()
# print(vggnet)
# output = vggnet(input)
# print(output)
2、TensorFlow实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# @File : model_vggnet.py
# @Time :
# @Author :0399
# @version :python 3.9
# @Software : PyCharm
# @Description:
"""
# ================【功能:】====================
import tensorflow as tf
from tensorflow.keras import layers, Model, Sequential

# He-style (VarianceScaling, scale=2, fan_out, truncated normal) initializer
# config for the conv kernels — suited to ReLU activations.
CONV_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 2.0,
        'mode': 'fan_out',
        'distribution': 'truncated_normal',
    },
}

# Uniform VarianceScaling (scale=1/3, fan_out) initializer config for the
# fully-connected layers.
DENSE_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 1. / 3.,
        'mode': 'fan_out',
        'distribution': 'uniform',
    },
}


def VGG(feature, im_height=224, im_width=224, num_classes=1000):
    """Attach the VGG classifier head to a convolutional backbone.

    Args:
        feature: callable backbone (e.g. the Sequential from make_feature)
            mapping the input image tensor to the final feature map.
        im_height: input image height (TensorFlow tensors are NHWC).
        im_width: input image width.
        num_classes: number of output classes.

    Returns:
        A tf.keras Model mapping (N, H, W, 3) images to softmax probabilities.
    """
    input_image = layers.Input(shape=(im_height, im_width, 3), dtype="float32")
    x = feature(input_image)
    x = layers.Flatten()(x)
    x = layers.Dropout(rate=0.5)(x)
    x = layers.Dense(2048, activation='relu',
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    x = layers.Dropout(rate=0.5)(x)
    x = layers.Dense(2048, activation='relu',
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    x = layers.Dropout(rate=0.5)(x)
    # BUG FIX: the logit layer must be linear. The original used
    # activation='relu' here, which clamps every negative logit to 0 before
    # the softmax, distorting the probabilities and crippling training.
    x = layers.Dense(num_classes, activation=None,
                     kernel_initializer=DENSE_KERNEL_INITIALIZER)(x)
    output = layers.Softmax()(x)

    model = Model(inputs=input_image, outputs=output)
    return model


def make_feature(cfg):
    """Build the convolutional backbone described by *cfg*.

    Each integer entry becomes a 3x3 same-padded ReLU conv with that many
    filters; the string 'M' becomes a 2x2 stride-2 max-pooling layer.
    Returns a Sequential named "feature".
    """
    ops = []
    for entry in cfg:
        if entry == 'M':
            ops.append(layers.MaxPool2D(pool_size=2, strides=2))
            continue
        ops.append(layers.Conv2D(entry, kernel_size=3, padding="same",
                                 activation="relu",
                                 kernel_initializer=CONV_KERNEL_INITIALIZER))
    return Sequential(ops, name="feature")


# Layer configurations from Table 1 of the VGG paper: each integer is the
# output-channel count of a 3x3 conv layer, 'M' marks a 2x2 max-pool.
cfgs = {
'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


def vgg(model_name="vgg16", im_height=224, im_width=224, num_classes=1000):
    """Assemble a full VGG model by name ('vgg11'/'vgg13'/'vgg16'/'vgg19').

    Raises KeyError if *model_name* is not a key of cfgs.
    """
    backbone = make_feature(cfgs[model_name])
    return VGG(backbone, im_height=im_height, im_width=im_width,
               num_classes=num_classes)


# input = tf.random.uniform((4, 224, 224, 3))
# vggnet = vgg("vgg16", num_classes=5)
# print(vggnet.summary())
# print(vggnet(input))