ResNet101+FPN Code Walkthrough

  • ResNet structure
  • FPN code
  • FPN paper

1. ResNet

  • resnet101 structure:

resnet101 code:

def resnet101(pretrained=False):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    # [3, 4, 23, 3] are the repeat counts of the blocks in each stage of ResNet-101
    model = ResNet(Bottleneck, [3, 4, 23, 3])
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)  # change
        self.layer1 = self._make_layer(block, 64, layers[0])             # 3 blocks
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)  # 4 blocks
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)  # 23 blocks
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)  # 3 blocks
        # it is slightly better whereas slower to set stride = 1
        # self.layer4 = self._make_layer(block, 512, layers[3], stride=1)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
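As a quick sanity check of the classifier output shape (not part of the original post), the same forward pass can be exercised with torchvision's stock ResNet-101, which the copy above closely follows apart from small tweaks such as the lines marked `# change`:

# minimal smoke test (assumes torchvision is installed)
import torch
import torchvision

model = torchvision.models.resnet101()
x = torch.randn(1, 3, 224, 224)   # dummy ImageNet-sized batch
print(model(x).shape)             # torch.Size([1, 1000])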
  • Initialization of the convolution and BN layers:
for m in self.modules():
    if isinstance(m, nn.Conv2d):
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        # conv layers are initialized from a zero-mean normal distribution (He init)
        m.weight.data.normal_(0, math.sqrt(2. / n))
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()
  • Structure of each block:

block code:

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)  # change
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,  # change
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):  # im_data: [1, 896, 600, 3]
        residual = x  # [1, 64, 224, 150]

        out = self.conv1(x)  # [1, 64, 224, 150]
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)  # [1, 64, 224, 150]
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)  # [1, 256, 224, 150]
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)  # [1, 256, 224, 150]
        # element-wise add
        out += residual
        out = self.relu(out)

        return out

resnet101 setup:

class resnet(_fasterRCNN):
    def __init__(self, classes, num_layers=101, pretrained=False, class_agnostic=False):
        self.model_path = 'data/pretrained_model/resnet101_caffe.pth'
        self.dout_base_model = 1024
        self.pretrained = pretrained
        self.class_agnostic = class_agnostic

        _fasterRCNN.__init__(self, classes, class_agnostic)

    def _init_modules(self):
        # this code uses the ResNet-101 configuration
        resnet = resnet101()

        if self.pretrained == True:
            print("Loading pretrained weights from %s" % (self.model_path))
            state_dict = torch.load(self.model_path)
            resnet.load_state_dict({k: v for k, v in state_dict.items() if k in resnet.state_dict()})

        # Build resnet: conv1, layer1, layer2, layer3
        self.RCNN_base = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu,
                                       resnet.maxpool, resnet.layer1, resnet.layer2, resnet.layer3)
        # layer4
        self.RCNN_top = nn.Sequential(resnet.layer4)

        self.RCNN_cls_score = nn.Linear(2048, self.n_classes)
        if self.class_agnostic:
            self.RCNN_bbox_pred = nn.Linear(2048, 4)
        else:
            self.RCNN_bbox_pred = nn.Linear(2048, 4 * self.n_classes)

        # Fix blocks
        # freeze the parameters of conv1 and bn1
        for p in self.RCNN_base[0].parameters(): p.requires_grad = False
        for p in self.RCNN_base[1].parameters(): p.requires_grad = False

        assert (0 <= cfg.RESNET.FIXED_BLOCKS < 4)
        if cfg.RESNET.FIXED_BLOCKS >= 3:
            for p in self.RCNN_base[6].parameters(): p.requires_grad = False
        if cfg.RESNET.FIXED_BLOCKS >= 2:
            for p in self.RCNN_base[5].parameters(): p.requires_grad = False
        # with cfg.RESNET.FIXED_BLOCKS = 1, layer1 is frozen
        # while layer2, layer3 and layer4 are fine-tuned during training
        if cfg.RESNET.FIXED_BLOCKS >= 1:
            for p in self.RCNN_base[4].parameters(): p.requires_grad = False

        def set_bn_fix(m):
            classname = m.__class__.__name__
            if classname.find('BatchNorm') != -1:
                for p in m.parameters(): p.requires_grad = False
        # freeze the parameters of all BN layers
        self.RCNN_base.apply(set_bn_fix)
        self.RCNN_top.apply(set_bn_fix)
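The forward pass in faster_rcnn.py (shown below) calls `self._head_to_tail(pooled_feat)` to push the pooled $7\times7$ ROI features through `RCNN_top` (layer4). That method is not part of this excerpt; a minimal sketch of what it looks like for this ResNet variant (an assumption, not quoted from the repo):

    def _head_to_tail(self, pool5):
        # run the 7x7 pooled ROI features through conv5_x (RCNN_top = layer4),
        # then global-average-pool to a 2048-d feature vector per ROI
        fc7 = self.RCNN_top(pool5).mean(3).mean(2)
        return fc7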

Faster RCNN uses the 91 layers from conv1 up to conv4_x as the shared convolutional layers (conv1, layer1, layer2 and layer3 in the code). The network branches after conv4_x: one path goes through the RPN to select proposals, while the other path, together with those RPN proposals, enters the ROI Pooling layer, which turns each proposal into a $7\times7$ feature. All of these outputs then pass through conv5_x, which plays the role of the fully connected (fc) layers, and finally through the classifier and the bounding-box regressor to produce the final results. The overall framework is shown below:

The corresponding code is in /faster-rcnn.pytorch/lib/model/faster_rcnn/faster_rcnn.py:

class _fasterRCNN(nn.Module):
    """ faster RCNN """
    def __init__(self, classes, class_agnostic):
        super(_fasterRCNN, self).__init__()
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = class_agnostic
        # loss
        self.RCNN_loss_cls = 0
        self.RCNN_loss_bbox = 0

        # define rpn
        self.RCNN_rpn = _RPN(self.dout_base_model)
        self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)
        self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)
        self.RCNN_roi_align = RoIAlignAvg(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)

        self.grid_size = cfg.POOLING_SIZE * 2 if cfg.CROP_RESIZE_WITH_MAX_POOL else cfg.POOLING_SIZE  # 7
        self.RCNN_roi_crop = _RoICrop()

    def forward(self, im_data, im_info, gt_boxes, num_boxes):
        batch_size = im_data.size(0)  # 1

        im_info = im_info.data  # [896, 600, 1.79]
        gt_boxes = gt_boxes.data  # [1, 1, 5]
        num_boxes = num_boxes.data

        # feed image data to base model to obtain base feature map
        # the base feature map is the output of conv1, layer1, layer2, layer3, i.e.
        # self.RCNN_base = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu,
        #                                resnet.maxpool, resnet.layer1, resnet.layer2, resnet.layer3)
        base_feat = self.RCNN_base(im_data)  # defined in class resnet

        # feed base feature map to RPN to obtain rois
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

        # if it is the training phase, use ground-truth bboxes for refining
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
            rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

            rois_label = Variable(rois_label.view(-1).long())
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
        else:
            rois_label = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0

        rois = Variable(rois)
        # do roi pooling based on predicted rois

        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.POOLING_MODE == 'align':
            pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
        elif cfg.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

        # feed pooled features to top model
        pooled_feat = self._head_to_tail(pooled_feat)

        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)

        # compute object classification probability
        cls_score = self.RCNN_cls_score(pooled_feat)
        cls_prob = F.softmax(cls_score)

        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

        if self.training:
            # classification loss
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

        cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
        bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

        return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label

2. FPN

Bottom-up pathway

The bottom-up pathway is the feed-forward computation of the backbone ConvNet, with a scaling step of 2 between stages. For ResNet we use the output of the last residual block of each stage, $\{C_2, C_3, C_4, C_5\}$, corresponding to the outputs of conv2, conv3, conv4 and conv5, which have strides of $\{4, 8, 16, 32\}$ pixels with respect to the input image.
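For a concrete feel of these strides, here is a tiny sanity check (the 800×600 input size is just an assumed example, not from the post):

import math

# spatial sizes of C2..C5 for an assumed 800x600 input,
# given the per-stage strides {4, 8, 16, 32}
H, W = 800, 600
for name, stride in zip(["C2", "C3", "C4", "C5"], [4, 8, 16, 32]):
    print(name, math.ceil(H / stride), "x", math.ceil(W / stride))
# C2 200x150, C3 100x75, C4 50x38, C5 25x19 (exact values depend on padding/rounding)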

Top-down pathway and lateral connections

How are high-level and low-level semantics combined? The coarser but semantically stronger high-level feature maps are upsampled and then merged, via lateral connections, with the feature maps from the corresponding bottom-up stage, producing enhanced features. The top-down pathway performs the upsampling, and each lateral connection merges the upsampled map with the bottom-up feature map of the same spatial size. As shown in the figure, the $1\times1$ convolution reduces the channel dimension so that the two maps can be merged by element-wise addition. This process is repeated until the finest-resolution map is produced.

All levels of the pyramid share the same classification and regression heads, so the feature dimension (number of channels), denoted $d$, is fixed across all feature maps. The paper sets $d = 256$, so all of the extra convolutional layers (e.g., the one producing $P_2$) have 256-channel outputs.
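For illustration, a minimal sketch of what these extra layers look like when the backbone is ResNet-101 (the channel counts 2048/1024/512/256 are the C5/C4/C3/C2 widths; the names mirror the RCNN_toplayer / RCNN_latlayer* / RCNN_smooth* modules used in the forward pass excerpted later, but the exact definitions are not part of this post and are an assumption):

import torch.nn as nn

# 1x1 convs reduce each backbone stage to d = 256 channels;
# 3x3 convs smooth the merged top-down maps
toplayer  = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)  # C5 -> P5
latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)  # lateral on C4
latlayer2 = nn.Conv2d(512,  256, kernel_size=1, stride=1, padding=0)  # lateral on C3
latlayer3 = nn.Conv2d(256,  256, kernel_size=1, stride=1, padding=0)  # lateral on C2
smooth1   = nn.Conv2d(256,  256, kernel_size=3, stride=1, padding=1)  # smooth P4
smooth2   = nn.Conv2d(256,  256, kernel_size=3, stride=1, padding=1)  # smooth P3
smooth3   = nn.Conv2d(256,  256, kernel_size=3, stride=1, padding=1)  # smooth P2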

Application in Faster RCNN

RPN

In the original RPN, a $3\times3$ window slides over the feature map and performs binary (object/background) classification and bounding-box regression. The $3\times3$ conv layer plus the following $1\times1$ conv layers for classification and regression are called the head. The objectness criterion and the bounding-box regression targets are both defined with respect to the anchors.

When FPN is applied to the RPN, the single-scale feature map is replaced by the feature pyramid: the same $3\times3$ and $1\times1$ head is attached to every level of the pyramid. Because the head slides densely over all positions of all levels, there is no need for multi-scale anchors at any single level.

Anchor definition: the original RPN applies 9 anchors of different scales and aspect ratios on a single feature map, whereas here each pyramid level is assigned a single anchor scale. The anchors on $\{P_2, P_3, P_4, P_5, P_6\}$ have areas of $\{32^2, 64^2, 128^2, 256^2, 512^2\}$ respectively, and each level uses aspect ratios of {1:2, 1:1, 2:1}, so the feature pyramid has 15 anchor types in total.
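A small worked example of this anchor set (widths and heights derived only from the areas and ratios above; the rounding is illustrative):

# one anchor scale per pyramid level, three aspect ratios per location:
# 5 levels x 3 ratios = 15 anchor shapes in total
areas = {"P2": 32 ** 2, "P3": 64 ** 2, "P4": 128 ** 2, "P5": 256 ** 2, "P6": 512 ** 2}
ratios = [0.5, 1.0, 2.0]  # height / width
for level, area in areas.items():
    shapes = []
    for r in ratios:
        w = (area / r) ** 0.5
        h = r * w
        shapes.append("%dx%d" % (round(w), round(h)))
    print(level, shapes)
# e.g. P2 -> ['45x23', '32x32', '23x45']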

Code:

class _RPN_FPN(nn.Module):
    def __init__(self, din):
        super(_RPN_FPN, self).__init__()

        self.din = din  # get depth of input feature map, e.g., 512
        self.anchor_ratios = cfg.ANCHOR_RATIOS
        self.anchor_scales = cfg.ANCHOR_SCALES
        self.feat_stride = cfg.FEAT_STRIDE[0]

        # define the convrelu layers processing input feature map
        self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True)  # din = 256

        # define bg/fg classification score layer
        # self.nc_score_out = len(self.anchor_scales) * len(self.anchor_ratios) * 2  # 2 (bg/fg) * 9 (anchors)
        self.nc_score_out = 1 * len(self.anchor_ratios) * 2  # 2 (bg/fg) * 3 (anchor ratios) * 1 (anchor scale)
        self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0)

        # define anchor box offset prediction layer
        # self.nc_bbox_out = len(self.anchor_scales) * len(self.anchor_ratios) * 4  # 4 (coords) * 9 (anchors)
        self.nc_bbox_out = 1 * len(self.anchor_ratios) * 4  # 4 (coords) * 3 (anchor ratios) * 1 (anchor scale)
        self.RPN_bbox_pred = nn.Conv2d(512, self.nc_bbox_out, 1, 1, 0)

        # define proposal layer
        self.RPN_proposal = _ProposalLayer_FPN(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        # define anchor target layer
        self.RPN_anchor_target = _AnchorTargetLayer_FPN(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0

    def forward(self, rpn_feature_maps, im_info, gt_boxes, num_boxes):

        n_feat_maps = len(rpn_feature_maps)  # 5

        rpn_cls_scores = []
        rpn_cls_probs = []
        rpn_bbox_preds = []
        rpn_shapes = []
        # compute the predicted classification scores and box offsets on p2..p6,
        # concatenate them, and then feed them all to the proposal layer
        for i in range(n_feat_maps):
            feat_map = rpn_feature_maps[i]
            batch_size = feat_map.size(0)

            # return feature map after convrelu layer
            rpn_conv1 = F.relu(self.RPN_Conv(feat_map), inplace=True)
            # get rpn classification score
            rpn_cls_score = self.RPN_cls_score(rpn_conv1)

            rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
            rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape)
            rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

            # get rpn offsets to the anchor boxes
            rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

            rpn_shapes.append([rpn_cls_score.size()[2], rpn_cls_score.size()[3]])
            rpn_cls_scores.append(rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
            rpn_cls_probs.append(rpn_cls_prob_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
            rpn_bbox_preds.append(rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4))

        rpn_cls_score_alls = torch.cat(rpn_cls_scores, 1)  # [1, 95400, 2]
        rpn_cls_prob_alls = torch.cat(rpn_cls_probs, 1)
        rpn_bbox_pred_alls = torch.cat(rpn_bbox_preds, 1)

        n_rpn_pred = rpn_cls_score_alls.size(1)

        # proposal layer
        cfg_key = 'TRAIN' if self.training else 'TEST'

        rois = self.RPN_proposal((rpn_cls_prob_alls.data, rpn_bbox_pred_alls.data,
                                  im_info, cfg_key, rpn_shapes))

        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0
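The loop above folds the score map down to two channels with `self.reshape(...)` before the softmax and unfolds it afterwards. That helper is not included in the excerpt; a minimal sketch of the usual implementation (an assumption consistent with how it is called above):

    @staticmethod
    def reshape(x, d):
        # view (N, C, H, W) as (N, d, C*H/d, W) so the softmax can be taken
        # over the bg/fg pair when d == 2
        input_shape = x.size()
        return x.view(
            input_shape[0],
            int(d),
            int(float(input_shape[1] * input_shape[2]) / float(d)),
            input_shape[3],
        )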

Fast RCNN

1. Feature maps

ROI Pooling takes the ROIs and a feature map as input, so with a feature pyramid we must decide, for each ROI, which level's feature map it should be pooled from. The assignment rule is:
$$
k = \left\lfloor k_0 + \log_2\!\left(\sqrt{wh}/224\right) \right\rfloor \qquad (1)
$$
Here 224 is the canonical ImageNet input size and $k_0 = 5$ is the base level, corresponding to the $P_5$ output. For example, if an ROI has $w = 112$ and $h = 112$, then $k = 5 - 1 = 4$, meaning that ROI should be pooled from the $P_4$ feature map; the bracket denotes rounding to an integer.

Code:

# this function decides which pyramid level each ROI should be pooled from
def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
    ''' roi pool on pyramid feature maps'''
    # do roi pooling based on predicted rois
    # area of the input image
    img_area = im_info[0][0] * im_info[0][1]
    # height and width of each ROI
    h = rois.data[:, 4] - rois.data[:, 2] + 1
    w = rois.data[:, 3] - rois.data[:, 1] + 1
    # compute, in two steps, which pyramid level each ROI should be pooled from
    # (corresponds to Eq. (1); note that torch.log is the natural log here,
    # whereas Eq. (1) uses log2)
    roi_level = torch.log(torch.sqrt(h * w) / 224.0)
    roi_level = torch.round(roi_level + 4)

    # clamp roi_level to the range [2, 5]
    roi_level[roi_level < 2] = 2
    roi_level[roi_level > 5] = 5
    # roi_level.fill_(5)

    # then do roi crop/align/pool; this version of the code exposes a config
    # option to choose among the three ROI feature extraction modes
    if cfg.POOLING_MODE == 'crop':
        ...
    elif cfg.POOLING_MODE == 'align':
        ...
    elif cfg.POOLING_MODE == 'pool':
        ...
    return roi_pool_feat
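The three pooling branches are elided above. As a rough, hedged sketch (not quoted from the repo; variable names are illustrative), the 'align' branch typically groups the ROIs by their assigned level, pools each group from the matching feature map, and then restores the original ROI order:

    # illustrative sketch of the 'align' branch body
    roi_pool_feats, box_to_levels = [], []
    for i, l in enumerate(range(2, 6)):               # pyramid levels P2..P5
        idx_l = (roi_level == l).nonzero().view(-1)   # ROIs assigned to level l
        if idx_l.numel() == 0:
            continue
        box_to_levels.append(idx_l)
        # RoIAlign this subset of ROIs against that level's feature map
        roi_pool_feats.append(self.RCNN_roi_align(feat_maps[i], rois[idx_l].view(-1, 5)))
    roi_pool_feat = torch.cat(roi_pool_feats, 0)
    # undo the grouping so features line up with the original ROI order
    order = torch.sort(torch.cat(box_to_levels, 0))[1]
    roi_pool_feat = roi_pool_feat[order]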

2. Fully connected layers

In the Fast RCNN paper, the head refers to the class-specific classifiers and bounding-box regressors.

In the ResNet paper, conv5 is used as the head of Fast RCNN, and the head parameters are shared across all levels.

In FPN, conv5 has already been used to build the feature pyramid, so the authors instead add two hidden 1024-d fully connected layers before the final classification and regression layers.
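A minimal sketch of such a head (the class name, the 81-class default and the 256×7×7 input are illustrative assumptions; the point is the two hidden 1024-d FC layers before the classification and box-regression layers):

import torch.nn as nn

# hypothetical FPN box head: two hidden 1024-d FC layers on top of
# d=256, 7x7 pooled ROI features
class TwoFCHead(nn.Module):
    def __init__(self, in_channels=256, pool_size=7, num_classes=81):
        super(TwoFCHead, self).__init__()
        self.fc1 = nn.Linear(in_channels * pool_size * pool_size, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.relu = nn.ReLU(inplace=True)
        self.cls_score = nn.Linear(1024, num_classes)
        self.bbox_pred = nn.Linear(1024, 4 * num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)          # flatten the pooled ROI features
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.cls_score(x), self.bbox_pred(x)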

Full architecture:

FPN code:

class _FPN(nn.Module):
    """ FPN """
    def __init__(self, classes, class_agnostic):
        super(_FPN, self).__init__()
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = class_agnostic
        # loss
        self.RCNN_loss_cls = 0
        self.RCNN_loss_bbox = 0

        self.maxpool2d = nn.MaxPool2d(1, stride=2)
        # define rpn
        self.RCNN_rpn = _RPN_FPN(self.dout_base_model)
        self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)

        # NOTE: the original paper used pool_size = 7 for cls branch, and 14 for mask branch, to save the
        # computation time, we first use 14 as the pool_size, and then do stride=2 pooling for cls branch.
        self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)
        self.RCNN_roi_align = RoIAlignAvg(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)
        self.grid_size = cfg.POOLING_SIZE * 2 if cfg.CROP_RESIZE_WITH_MAX_POOL else cfg.POOLING_SIZE
        self.RCNN_roi_crop = _RoICrop()

    # parameter initialization
    def _init_weights(self):
        def normal_init(m, mean, stddev, truncated=False):
            """
            weight initalizer: truncated normal and random normal.
            """
            # x is a parameter
            ...

        # custom weights initialization called on netG and netD
        def weights_init(m, mean, stddev, truncated=False):
            ...

    def create_architecture(self):
        self._init_modules()
        self._init_weights()

    def _upsample_add(self, x, y):
        '''Upsample and add two feature maps.
        Args:
            x: (Variable) top feature map to be upsampled.
            y: (Variable) lateral feature map.
        Returns:
            (Variable) added feature map.
        Note in PyTorch, when input size is odd, the upsampled feature map
        with `F.upsample(..., scale_factor=2, mode='nearest')`
        maybe not equal to the lateral feature map size.
        e.g.
        original input size: [N,_,15,15] ->
        conv2d feature map size: [N,_,8,8] ->
        upsampled feature map size: [N,_,16,16]
        So we choose bilinear upsample which supports arbitrary output sizes.
        '''
        # as the docstring explains, x2 nearest upsampling can mismatch the lateral
        # feature map when the input size is odd (e.g. [15,15] -> [8,8] -> [16,16]),
        # so bilinear upsampling to the exact lateral size (H, W) is used instead
        _, _, H, W = y.size()
        return F.upsample(x, size=(H, W), mode='bilinear') + y

    def forward(self, im_data, im_info, gt_boxes, num_boxes):
        batch_size = im_data.size(0)

        im_info = im_info.data
        gt_boxes = gt_boxes.data
        num_boxes = num_boxes.data

        # this is exactly the FPN structure: bottom-up first,
        # then top-down with lateral connections
        # feed image data to base model to obtain base feature map
        # Bottom-up
        c1 = self.RCNN_layer0(im_data)  # im_data: [1, 3, 850, 600], c1: [1, 64, 212, 150]
        c2 = self.RCNN_layer1(c1)  # c2: [1, 256, 212, 150]
        c3 = self.RCNN_layer2(c2)  # c3: [1, 512, 106, 75]
        c4 = self.RCNN_layer3(c3)  # c4: [1, 1024, 53, 38]
        c5 = self.RCNN_layer4(c4)  # c5: [1, 2048, 53, 38]
        # Top-down
        p5 = self.RCNN_toplayer(c5)  # p5: [1, 256, 53, 38]
        p4 = self._upsample_add(p5, self.RCNN_latlayer1(c4))  # p4: [1, 256, 53, 38]
        p4 = self.RCNN_smooth1(p4)
        p3 = self._upsample_add(p4, self.RCNN_latlayer2(c3))
        p3 = self.RCNN_smooth2(p3)  # p3: [1, 256, 106, 75]
        p2 = self._upsample_add(p3, self.RCNN_latlayer3(c2))  # p2: [1, 256, 212, 150]
        p2 = self.RCNN_smooth3(p2)

        p6 = self.maxpool2d(p5)  # p6: [1, 256, 27, 19]

        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(rpn_feature_maps, im_info, gt_boxes, num_boxes)

        # if it is the training phase, use ground-truth bboxes for refining
        ...

        # pooling features based on rois, output 14x14 map
        roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)

        # feed pooled features to top model
        pooled_feat = self._head_to_tail(roi_pool_feat)

        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)

        # compute object classification probability
        cls_score = self.RCNN_cls_score(pooled_feat)
        cls_prob = F.softmax(cls_score)

        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

        ...

        return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label

Note: the code is long, so only the parts that show the FPN structure are excerpted here and most details are omitted; see the source code for the rest. This version of the code also has a problem: the accuracy of the trained model is actually slightly lower than that of plain Faster RCNN. The author says he is still hunting the bug...

Reference

1. FPN paper reading notes

2. FPN code details