Preface
In the previous chapter we explained the principles behind object detection algorithms. Thanks to the community's work, detection can now be done in a single stage; a key point is how to associate feature maps with predicted boxes. We also explained what each value in the P0 feature map means during prediction. In the end we obtain objectness, location, and classification predictions, compute a loss against the training annotations, and train the network to minimize that loss, yielding a reasonably good detection model.
Implementing the Object Detection Algorithm
Data Loading
import functools
import numpy as np
import paddle

# Load data with multiple threads using paddle.reader.xmap_readers
def multithread_loader(datadir, batch_size=10, mode='train'):
    cname2cid = get_insect_names()
    records = get_annotations(cname2cid, datadir)
    def reader():
        if mode == 'train':
            np.random.shuffle(records)
        img_size = get_img_size(mode)
        batch_data = []
        for record in records:
            batch_data.append((record, img_size))
            if len(batch_data) == batch_size:
                yield batch_data
                batch_data = []
                # Resample the image size for the next batch (multi-scale training)
                img_size = get_img_size(mode)
        if len(batch_data) > 0:
            yield batch_data
    def get_data(samples):
        batch_data = []
        for sample in samples:
            record = sample[0]
            img_size = sample[1]
            img, gt_bbox, gt_labels, im_shape = get_img_data(record, size=img_size)
            batch_data.append((img, gt_bbox, gt_labels, im_shape))
        return make_array(batch_data)
    mapper = functools.partial(get_data)
    return paddle.reader.xmap_readers(mapper, reader, 8, 10)
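For reference, here is a hypothetical usage of the loader; the dataset path is illustrative:

# Hypothetical usage: build a training reader and fetch one batch
train_loader = multithread_loader('./insects/train', batch_size=10, mode='train')
img, gt_boxes, gt_labels, im_shape = next(train_loader())
print(img.shape)  # batch of images stacked by make_array; spatial size depends on the sampled img_size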
get_insect_names() is just a simple mapping from class names to numeric class IDs; a minimal sketch is shown below. After that comes get_annotations(), which reads the training annotations and converts each image's ground-truth boxes to [x, y, w, h] format.
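A minimal sketch of get_insect_names, assuming the seven class names of the AI insect-recognition dataset (the exact list is an assumption here):

def get_insect_names():
    # Map each insect class name to a numeric class ID (assumed class list)
    insect_names = ['Boerner', 'Leconte', 'Linnaeus',
                    'acuminatus', 'armandi', 'coleoptera', 'linnaeus']
    return {name: i for i, name in enumerate(insect_names)}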
import os
import numpy as np
import xml.etree.ElementTree as ET

# Read the insect annotations from VOC-style XML files
def get_annotations(cname2cid, datadir):
    filenames = os.listdir(os.path.join(datadir, 'annotations', 'xmls'))
    records = []
    ct = 0
    for fname in filenames:
        fid = fname.split('.')[0]
        fpath = os.path.join(datadir, 'annotations', 'xmls', fname)
        img_file = os.path.join(datadir, 'images', fid + '.jpeg')
        tree = ET.parse(fpath)
        if tree.find('id') is None:
            im_id = np.array([ct])
        else:
            im_id = np.array([int(tree.find('id').text)])
        objs = tree.findall('object')
        im_w = float(tree.find('size').find('width').text)
        im_h = float(tree.find('size').find('height').text)
        gt_bbox = np.zeros((len(objs), 4), dtype=np.float32)
        gt_class = np.zeros((len(objs), ), dtype=np.int32)
        is_crowd = np.zeros((len(objs), ), dtype=np.int32)
        difficult = np.zeros((len(objs), ), dtype=np.int32)
        for i, obj in enumerate(objs):
            cname = obj.find('name').text
            gt_class[i] = cname2cid[cname]
            _difficult = int(obj.find('difficult').text)
            x1 = float(obj.find('bndbox').find('xmin').text)
            y1 = float(obj.find('bndbox').find('ymin').text)
            x2 = float(obj.find('bndbox').find('xmax').text)
            y2 = float(obj.find('bndbox').find('ymax').text)
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(im_w - 1, x2)
            y2 = min(im_h - 1, y2)
            # Ground-truth boxes are stored in xywh format (center x, center y, width, height)
            gt_bbox[i] = [(x1 + x2) / 2.0, (y1 + y2) / 2.0, x2 - x1 + 1., y2 - y1 + 1.]
            is_crowd[i] = 0
            difficult[i] = _difficult
        voc_rec = {
            'im_file': img_file,
            'im_id': im_id,
            'h': im_h,
            'w': im_w,
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_bbox': gt_bbox,
            'gt_poly': [],
            'difficult': difficult
        }
        if len(objs) != 0:
            records.append(voc_rec)
        ct += 1
    return records
Defining the YOLOv3 Model
# Define the YOLOv3 model: the forward pass of the detection head
def forward(self, inputs):
    outputs = []
    blocks = self.block(inputs)
    for i, block in enumerate(blocks):
        if i > 0:
            # Concatenate the feature map obtained by convolving and
            # upsampling r_{i-1} with this level's c_i
            block = fluid.layers.concat(input=[route, block], axis=1)
        # Generate t_i and r_i from c_i
        route, tip = self.yolo_blocks[i](block)
        # Generate p_i from t_i
        block_out = self.block_outputs[i](tip)
        # Collect p_i
        outputs.append(block_out)
        if i < 2:
            # Convolve r_i to adjust its number of channels
            route = self.route_blocks_2[i](route)
            # Upsample r_i so its size matches c_{i+1}
            route = self.upsample(route)
    return outputs
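As a quick sanity check on the outputs, something like the following could be run in dygraph mode. This sketch assumes the model class is named YOLOv3, takes a num_classes argument, and that the dataset has 7 classes, so each prediction map has 3 * (5 + 7) = 36 channels:

import numpy as np
import paddle.fluid as fluid

# Illustrative shape check (assumptions: class name YOLOv3, 7 classes, 640x640 input)
with fluid.dygraph.guard():
    model = YOLOv3(num_classes=7)
    x = fluid.dygraph.to_variable(np.random.randn(1, 3, 640, 640).astype('float32'))
    P0, P1, P2 = model(x)
    print(P0.shape, P1.shape, P2.shape)
    # Expected: [1, 36, 20, 20] [1, 36, 40, 40] [1, 36, 80, 80] (strides 32, 16, 8)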
The backbone of this model is adapted from Darknet53 and outputs three intermediate convolutional feature maps (C0, C1, C2).

def forward(self, inputs):
    out = self.conv0(inputs)
    out = self.downsample0(out)
    blocks = []
    # Apply each stage to the input in sequence
    for i, conv_block_i in enumerate(self.darknet53_conv_block_list):
        out = conv_block_i(out)
        blocks.append(out)
        if i < len(self.stages) - 1:
            out = self.downsample_list[i](out)
    return blocks[-1:-4:-1]  # return C0, C1, C2
The code above is the part of the backbone that extracts the feature maps (C0, C1, C2).
After the backbone runs, every level except the first (C0) concatenates the upsampled route tensor R_{i-1} from the previous level with its C_i; yolo_blocks plus one convolution layer (block_outputs) then produce (P0, P1, P2). YoloDetectionBlock was covered in the previous chapter, so we won't repeat it here; the upsampling layer it relies on is sketched below.
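A minimal sketch of the Upsample layer assumed by self.upsample above, using nearest-neighbor interpolation (the class name and default scale are assumptions):

import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer

class Upsample(Layer):
    def __init__(self, scale=2):
        super(Upsample, self).__init__()
        self.scale = scale

    def forward(self, inputs):
        # Double the spatial resolution so R_i matches C_{i+1}
        return fluid.layers.resize_nearest(input=inputs, scale=self.scale)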
Training the Model

import time
import numpy as np
from paddle.fluid.dygraph import to_variable

for i, data in enumerate(train_loader()):
    img, gt_boxes, gt_labels, img_scale = data
    gt_scores = np.ones(gt_labels.shape).astype('float32')
    gt_scores = to_variable(gt_scores)
    img = to_variable(img)
    gt_boxes = to_variable(gt_boxes)
    gt_labels = to_variable(gt_labels)
    outputs = model(img)
    loss = model.get_loss(outputs, gt_boxes, gt_labels,
                          gtscore=gt_scores,
                          anchors=ANCHORS,
                          anchor_masks=ANCHOR_MASKS,
                          ignore_thresh=IGNORE_THRESH,
                          use_label_smooth=False)
    loss.backward()
    opt.minimize(loss)
    model.clear_gradients()
    if i % 1 == 0:  # log every iteration
        timestring = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        print('{}[TRAIN]epoch {}, iter {}, output loss: {}'.format(
            timestring, epoch, i, loss.numpy()))
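The loop above assumes some setup that is not shown. The following sketch is illustrative: the anchor constants match the get_loss defaults below, and DecayedAdagradOptimizer with lr=0.01 matches the tuning results at the end of this section, but names such as TRAINDIR and MAX_EPOCH are assumptions:

import paddle.fluid as fluid

ANCHORS = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
           59, 119, 116, 90, 156, 198, 373, 326]
ANCHOR_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
IGNORE_THRESH = 0.7
TRAINDIR = './insects/train'  # assumed dataset location
MAX_EPOCH = 60

with fluid.dygraph.guard():
    model = YOLOv3(num_classes=7)
    opt = fluid.optimizer.DecayedAdagradOptimizer(
        learning_rate=0.01, parameter_list=model.parameters())
    train_loader = multithread_loader(TRAINDIR, batch_size=10, mode='train')
    for epoch in range(MAX_EPOCH):
        ...  # run the training loop shown above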
Since the model definition has already been covered, the key piece here is the get_loss function:
def get_loss(self, outputs, gtbox, gtlabel, gtscore=None,
             anchors=[10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
                      59, 119, 116, 90, 156, 198, 373, 326],
             anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
             ignore_thresh=0.7,
             use_label_smooth=False):
    """
    Use fluid.layers.yolov3_loss to compute the loss directly; this is both
    simpler and faster than doing it by hand.
    """
    self.losses = []
    downsample = 32
    for i, out in enumerate(outputs):  # compute the loss for each of the three levels
        anchor_mask_i = anchor_masks[i]
        loss = fluid.layers.yolov3_loss(
            x=out,                # out is one of P0, P1, P2
            gt_box=gtbox,         # ground-truth box coordinates
            gt_label=gtlabel,     # ground-truth box classes
            gt_score=gtscore,     # ground-truth box scores; needed for the mixup trick, otherwise all ones, same shape as gtlabel
            anchors=anchors,      # anchor sizes, [w0, h0, w1, h1, ..., w8, h8] for all 9 anchors
            anchor_mask=anchor_mask_i,  # mask selecting this level's anchors, e.g. [3, 4, 5] picks anchors 3, 4, 5
            class_num=self.num_classes,     # number of classes
            ignore_thresh=ignore_thresh,    # when a predicted box's IoU with a ground-truth box > ignore_thresh, objectness is labeled -1
            downsample_ratio=downsample,    # downsampling factor of the feature map vs. the input: 32 for P0, 16 for P1, 8 for P2
            use_label_smooth=False)         # label-smoothing trick; not used here, so set to False
        self.losses.append(fluid.layers.reduce_mean(loss))  # reduce_mean averages the loss over the images in the batch
        downsample = downsample // 2  # the next level's downsampling factor is halved
    return sum(self.losses)  # sum the losses over the three levels
There are 9 anchor boxes (AB) in total, three for each of (C0, C1, C2). The IoU threshold is 0.7: an anchor whose predicted box exceeds this IoU with a ground-truth box but is not the best match has its objectness labeled -1, so it is ignored by the loss. Here we simply call PaddlePaddle's yolov3_loss function to compute the per-level loss, then sum the losses of the three levels (C0, C1, C2) to obtain the final loss value.
Testing the Model

total_results = []  # collect predictions for all test images
for i, data in enumerate(test_loader()):
    img_name, img_data, img_scale_data = data
    img = to_variable(img_data)
    img_scale = to_variable(img_scale_data)
    outputs = model.forward(img)
    bboxes, scores = model.get_pred(outputs,
                                    im_shape=img_scale,
                                    anchors=ANCHORS,
                                    anchor_masks=ANCHOR_MASKS,
                                    valid_thresh=VALID_THRESH)
    bboxes_data = bboxes.numpy()
    scores_data = scores.numpy()
    result = multiclass_nms(bboxes_data, scores_data,
                            score_thresh=VALID_THRESH,
                            nms_thresh=NMS_THRESH,
                            pre_nms_topk=NMS_TOPK,
                            pos_nms_topk=NMS_POSK)
    for j in range(len(result)):
        result_j = result[j]
        img_name_j = img_name[j]
        total_results.append([img_name_j, result_j.tolist()])
    print('processed {} pictures'.format(len(total_results)))
We read the images and their metadata from the test set, then call get_pred, which internally uses PaddlePaddle's yolo_box to return a [N, M, 4] tensor of box coordinates and a [N, M, class_num] tensor of classification scores. Finally, multiclass_nms filters the candidate boxes, producing, for each image, its file name along with the positions and classes of the detected insects. A sketch of what get_pred might look like internally follows.
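A minimal sketch of get_pred built on fluid.layers.yolo_box; this is an illustration under the assumptions above, not the exact source:

def get_pred(self, outputs, im_shape=None, anchors=None,
             anchor_masks=None, valid_thresh=0.01):
    downsample = 32
    total_boxes, total_scores = [], []
    for i, out in enumerate(outputs):
        # Pick the [w, h] pairs of the three anchors assigned to this level
        anchors_this_level = []
        for m in anchor_masks[i]:
            anchors_this_level.extend(anchors[2 * m:2 * m + 2])
        boxes, scores = fluid.layers.yolo_box(
            x=out,
            img_size=im_shape,
            anchors=anchors_this_level,
            class_num=self.num_classes,
            conf_thresh=valid_thresh,
            downsample_ratio=downsample)
        total_boxes.append(boxes)    # [N, M_i, 4]
        total_scores.append(scores)  # [N, M_i, class_num]
        downsample //= 2
    # Concatenate the three levels: [N, M, 4] and [N, M, class_num]
    return (fluid.layers.concat(total_boxes, axis=1),
            fluid.layers.concat(total_scores, axis=1))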
Model Evaluation: mAP
We usually measure a detection model with mAP (mean average precision). So what exactly is mAP?
- True Positive (TP): detections whose IoU with a ground-truth box is above the threshold (usually 0.5)
- False Positive (FP): detections whose IoU with every ground-truth box is below the threshold
- False Negative (FN): ground-truth boxes (GTB) that are not detected
- True Negative (TN): not used in detection, since correct "non-detections" of background cannot be counted
Precision: TP / (TP + FP)
Recall: TP / (TP + FN)
Plotting Recall on the x-axis against Precision on the y-axis gives the PR curve. The area under this curve is the average precision (AP), and the mean of AP over all classes is the mAP (mean average precision).
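Since the evaluation below reports an 11-point interpolated value, here is a minimal sketch of 11-point AP for a single class (an illustrative helper, not taken from calculate_map.py):

import numpy as np

def eleven_point_ap(precision, recall):
    # precision/recall: arrays over detections sorted by descending confidence
    ap = 0.0
    for t in np.arange(0.0, 1.1, 0.1):
        mask = recall >= t
        # Interpolated precision: max precision among points with recall >= t
        p = np.max(precision[mask]) if mask.any() else 0.0
        ap += p / 11.0
    return ap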
We evaluate the mAP of our predictions on the test set directly with calculate_map.py from the AI insect-recognition baseline code:
!python calculate_map.py --anno_dir=./insects/val/annotations/xmls --pred_result=./pred_results.json
Result after training for 60 epochs: Accumulating evaluatation results... mAP(0.50, 11point) = 66.05
mAP values of the models trained while tuning hyperparameters:

- DecayedAdagradOptimizer, lr=0.001, 5 epochs: mAP 37.06
- DecayedAdagradOptimizer, lr=0.1, 5 epochs: does not converge (PaddlePaddle raises an error)
- DecayedAdagradOptimizer, lr=0.01, 5 epochs: mAP 43.40
- DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 66.05
- DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 59.45

With everything else unchanged, adding random cropping (sketched below):

- DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 72.63
- DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 74.17
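A minimal sketch of the random-crop augmentation referred to above (illustrative; a real implementation must also clip boxes to the crop and drop boxes that fall outside it):

import numpy as np

def random_crop(img, gt_bbox, scale_range=(0.6, 1.0)):
    # img: HWC image array; gt_bbox: [x_center, y_center, w, h] boxes in pixels
    h, w = img.shape[:2]
    scale = np.random.uniform(*scale_range)
    ch, cw = int(h * scale), int(w * scale)
    y0 = np.random.randint(0, h - ch + 1)
    x0 = np.random.randint(0, w - cw + 1)
    crop = img[y0:y0 + ch, x0:x0 + cw]
    # Shift box centers into the crop's coordinate frame
    bbox = gt_bbox.copy()
    bbox[:, 0] -= x0
    bbox[:, 1] -= y0
    return crop, bbox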