
Preface

In the previous chapter we explained the object detection algorithm from first principles. Thanks to the efforts of the framework developers, detection can be done in a single step; one key point along the way is how feature maps are associated with prediction boxes. We also explained what each value in the P0 feature map means during prediction: from it we obtain the objectness, location, and classification predictions, compute a loss against the training annotations, and train the network to minimize that loss, eventually obtaining a reasonably good detection model.
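As a quick sanity check on that layout, the channel count of P0 follows directly from it. A minimal sketch (the class count of 7 is an assumption for the AI识虫 insect dataset used in this chapter):

# Each anchor at a P0 cell predicts objectness (1), location (4) and one score per class
num_anchors_per_cell = 3   # anchors assigned to each level
num_classes = 7            # assumed class count of the AI识虫 dataset
channels_p0 = num_anchors_per_cell * (5 + num_classes)
print(channels_p0)         # 36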

Coding the Object Detection Algorithm

Data Loading

# Use paddle.reader.xmap_readers to load data with multiple threads
import functools

import numpy as np
import paddle

def multithread_loader(datadir, batch_size=10, mode='train'):
    cname2cid = get_insect_names()
    records = get_annotations(cname2cid, datadir)

    def reader():
        if mode == 'train':
            np.random.shuffle(records)
        img_size = get_img_size(mode)
        batch_data = []
        for record in records:
            batch_data.append((record, img_size))
            if len(batch_data) == batch_size:
                yield batch_data
                batch_data = []
                img_size = get_img_size(mode)
        if len(batch_data) > 0:
            yield batch_data

    def get_data(samples):
        # Decode images and annotations for one batch of (record, img_size) pairs
        batch_data = []
        for sample in samples:
            record = sample[0]
            img_size = sample[1]
            img, gt_bbox, gt_labels, im_shape = get_img_data(record, size=img_size)
            batch_data.append((img, gt_bbox, gt_labels, im_shape))
        return make_array(batch_data)

    mapper = functools.partial(get_data)

    # 8 worker threads, buffering up to 10 pending batches
    return paddle.reader.xmap_readers(mapper, reader, 8, 10)
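A hedged usage sketch of the loader (the dataset path is an assumption; point it at wherever the insect dataset is unpacked):

# Hypothetical path to the training split of the insect dataset
train_loader = multithread_loader('/home/aistudio/work/insects/train',
                                  batch_size=10, mode='train')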

get_insect_names() is simply a mapping from class-name strings to numeric IDs, sketched below. get_annotations() then reads each training image's XML annotation and converts every ground-truth box into [x, y, w, h] form; its full listing follows the sketch.
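A minimal sketch of the mapping (the seven class names are an assumption, taken from the AI识虫 insect dataset):

INSECT_NAMES = ['Boerner', 'Leconte', 'Linnaeus',
                'acuminatus', 'armandi', 'coleoptera', 'linnaeus']

def get_insect_names():
    # Map each insect class name to an integer id, e.g. {'Boerner': 0, ...}
    return {name: i for i, name in enumerate(INSECT_NAMES)}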

# Collect the insect annotation records
import os
import xml.etree.ElementTree as ET

import numpy as np

def get_annotations(cname2cid, datadir):
    filenames = os.listdir(os.path.join(datadir, 'annotations', 'xmls'))
    records = []
    ct = 0
    for fname in filenames:
        fid = fname.split('.')[0]
        fpath = os.path.join(datadir, 'annotations', 'xmls', fname)
        img_file = os.path.join(datadir, 'images', fid + '.jpeg')
        tree = ET.parse(fpath)

        if tree.find('id') is None:
            im_id = np.array([ct])
        else:
            im_id = np.array([int(tree.find('id').text)])

        objs = tree.findall('object')
        im_w = float(tree.find('size').find('width').text)
        im_h = float(tree.find('size').find('height').text)
        gt_bbox = np.zeros((len(objs), 4), dtype=np.float32)
        gt_class = np.zeros((len(objs), ), dtype=np.int32)
        is_crowd = np.zeros((len(objs), ), dtype=np.int32)
        difficult = np.zeros((len(objs), ), dtype=np.int32)
        for i, obj in enumerate(objs):
            cname = obj.find('name').text
            gt_class[i] = cname2cid[cname]
            _difficult = int(obj.find('difficult').text)
            x1 = float(obj.find('bndbox').find('xmin').text)
            y1 = float(obj.find('bndbox').find('ymin').text)
            x2 = float(obj.find('bndbox').find('xmax').text)
            y2 = float(obj.find('bndbox').find('ymax').text)
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(im_w - 1, x2)
            y2 = min(im_h - 1, y2)
            # Ground-truth boxes are stored in [center_x, center_y, w, h] format
            gt_bbox[i] = [(x1+x2)/2.0, (y1+y2)/2.0, x2-x1+1., y2-y1+1.]
            is_crowd[i] = 0
            difficult[i] = _difficult

        voc_rec = {
            'im_file': img_file,
            'im_id': im_id,
            'h': im_h,
            'w': im_w,
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_bbox': gt_bbox,
            'gt_poly': [],
            'difficult': difficult
            }
        if len(objs) != 0:
            records.append(voc_rec)
        ct += 1
    return records
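Hypothetical usage, just to show the shape of the returned records (the path is again an assumption):

cname2cid = get_insect_names()
records = get_annotations(cname2cid, '/home/aistudio/work/insects/train')
print(len(records), records[0]['im_file'])  # number of images and the first image path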

Defining the YOLOv3 Model

# Forward pass of the YOLOv3 model
    def forward(self, inputs):
        outputs = []
        blocks = self.block(inputs)
        for i, block in enumerate(blocks):
            if i > 0:
                # Concatenate the feature map obtained from r_{i-1} (after conv
                # and upsampling) with this level's c_i
                block = fluid.layers.concat(input=[route, block], axis=1)
            # Generate t_i and r_i from c_i
            route, tip = self.yolo_blocks[i](block)
            # Generate p_i from t_i
            block_out = self.block_outputs[i](tip)
            # Collect p_i
            outputs.append(block_out)

            if i < 2:
                # Apply a convolution to r_i to adjust its channel count
                route = self.route_blocks_2[i](route)
                # Upsample r_i so its spatial size matches c_{i+1}
                route = self.upsample(route)

        return outputs
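A hedged smoke test of the forward pass (the class name YOLOv3 and its constructor arguments are assumptions; with strides 32/16/8, a 640x640 input yields 20x20, 40x40 and 80x80 maps):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable

with fluid.dygraph.guard():
    model = YOLOv3(num_classes=7)  # assumed constructor signature
    x = np.random.randn(1, 3, 640, 640).astype('float32')
    P0, P1, P2 = model(to_variable(x))  # feature maps at strides 32, 16, 8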

The backbone here is adapted from DarkNet53 and outputs the results of three intermediate convolution stages (C0, C1, C2).

    def forward(self, inputs):
        out = self.conv0(inputs)
        out = self.downsample0(out)
        blocks = []
        # Apply the stages one after another, keeping every stage's output
        for i, conv_block_i in enumerate(self.darknet53_conv_block_list):
            out = conv_block_i(out)
            blocks.append(out)
            if i < len(self.stages) - 1:
                out = self.downsample_list[i](out)
        return blocks[-1:-4:-1]  # return the last three stage outputs as C0, C1, C2

The code above is the part of the backbone that extracts the feature maps (C0, C1, C2).
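The slice blocks[-1:-4:-1] at the end simply takes the last three stage outputs in reverse order, so the deepest map comes first:

blocks = ['s1', 's2', 's3', 's4', 's5']  # toy stand-ins for the stage outputs
print(blocks[-1:-4:-1])                  # ['s5', 's4', 's3'] -> (C0, C1, C2)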

After the backbone pass, every level except C0 concatenates the upsampled R_{i-1} with its C_i. The yolo_blocks plus one extra convolution layer (block_outputs) then produce (P0, P1, P2). YoloDetectionBlock was already covered in the previous chapter, so we will not repeat it here.
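The upsampling used in forward can be as simple as nearest-neighbour resizing. A minimal sketch, assuming Paddle 1.x dygraph (the layer name Upsample is hypothetical):

import paddle.fluid as fluid

class Upsample(fluid.dygraph.Layer):
    # Nearest-neighbour upsampling so r_i matches the spatial size of c_{i+1}
    def __init__(self, scale=2):
        super(Upsample, self).__init__()
        self.scale = scale

    def forward(self, inputs):
        # Doubles height and width by default; the channel count is unchanged
        return fluid.layers.resize_nearest(input=inputs, scale=self.scale)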

Training the Model

# Inside the epoch loop; train_loader, model and opt are created beforehand
for i, data in enumerate(train_loader()):
    img, gt_boxes, gt_labels, img_scale = data
    gt_scores = np.ones(gt_labels.shape).astype('float32')
    gt_scores = to_variable(gt_scores)
    img = to_variable(img)
    gt_boxes = to_variable(gt_boxes)
    gt_labels = to_variable(gt_labels)
    outputs = model(img)
    loss = model.get_loss(outputs, gt_boxes, gt_labels,
                          gtscore=gt_scores,
                          anchors=ANCHORS,
                          anchor_masks=ANCHOR_MASKS,
                          ignore_thresh=IGNORE_THRESH,
                          use_label_smooth=False)
    loss.backward()
    opt.minimize(loss)
    model.clear_gradients()
    if i % 1 == 0:  # i % 1 == 0 is always true, so this logs every iteration
        timestring = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        print('{}[TRAIN]epoch {}, iter {}, output loss: {}'.format(timestring, epoch, i, loss.numpy()))
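The loop assumes the anchor configuration constants are already defined; matching the defaults of get_loss below, they could be set as:

# Anchor sizes and per-level masks, identical to the get_loss defaults below
ANCHORS = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
ANCHOR_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
IGNORE_THRESH = 0.7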

Since the model definition was already covered above, the important piece here is the get_loss function:

def get_loss(self, outputs, gtbox, gtlabel, gtscore=None,
             anchors=[10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326],
             anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
             ignore_thresh=0.7,
             use_label_smooth=False):
    """
    Use fluid.layers.yolov3_loss to compute the loss directly; this is both
    more concise and faster.
    """
    self.losses = []
    downsample = 32
    for i, out in enumerate(outputs):  # compute the loss for each of the three levels
        anchor_mask_i = anchor_masks[i]
        loss = fluid.layers.yolov3_loss(
                x=out,  # out is one of P0, P1, P2
                gt_box=gtbox,  # ground-truth box coordinates
                gt_label=gtlabel,  # ground-truth box classes
                gt_score=gtscore,  # ground-truth box scores; needed for the mixup trick, otherwise set to all ones, same shape as gtlabel
                anchors=anchors,   # anchor sizes, [w0, h0, w1, h1, ..., w8, h8] for all 9 anchors
                anchor_mask=anchor_mask_i,  # mask selecting this level's anchors, e.g. anchor_mask_i=[3, 4, 5] picks anchors 3, 4 and 5 out of anchors
                class_num=self.num_classes,  # number of classes
                ignore_thresh=ignore_thresh,  # when a prediction's IoU with a ground-truth box > ignore_thresh (but it is not the best match), its objectness label is set to -1
                downsample_ratio=downsample,  # how much the feature map is downscaled relative to the input: 32 for P0, 16 for P1, 8 for P2
                use_label_smooth=False)       # only needed for the label-smoothing trick, which is not used here
        self.losses.append(fluid.layers.reduce_mean(loss))  # reduce_mean averages the per-image losses over the batch
        downsample = downsample // 2  # the next level's downsampling ratio is halved
    return sum(self.losses)  # sum the losses of the three levels

There are 9 anchor boxes (AB) in total, three for each of the levels (C0, C1, C2). The IoU threshold is 0.7: an anchor box whose IoU with a ground-truth box exceeds this threshold, but which is not the best match, has its objectness labeled as -1. Here we use PaddlePaddle's built-in yolov3_loss function to compute the loss directly, and summing the losses of the three levels (C0, C1, C2) gives the final loss value.
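To make the mask selection concrete, this small sketch prints which (w, h) anchor pairs each level receives:

for level, mask in enumerate(ANCHOR_MASKS):
    sizes = [(ANCHORS[2 * m], ANCHORS[2 * m + 1]) for m in mask]
    print('P{}: {}'.format(level, sizes))
# P0: [(116, 90), (156, 198), (373, 326)]  the largest anchors go to the coarsest map
# P1: [(30, 61), (62, 45), (59, 119)]
# P2: [(10, 13), (16, 30), (33, 23)]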

Testing the Model

total_results = []  # accumulate [image name, detections] pairs across the test set
for i, data in enumerate(test_loader()):
    img_name, img_data, img_scale_data = data
    img = to_variable(img_data)
    img_scale = to_variable(img_scale_data)

    outputs = model.forward(img)
    bboxes, scores = model.get_pred(outputs,
                                    im_shape=img_scale,
                                    anchors=ANCHORS,
                                    anchor_masks=ANCHOR_MASKS,
                                    valid_thresh=VALID_THRESH)

    bboxes_data = bboxes.numpy()
    scores_data = scores.numpy()
    result = multiclass_nms(bboxes_data, scores_data,
                            score_thresh=VALID_THRESH,
                            nms_thresh=NMS_THRESH,
                            pre_nms_topk=NMS_TOPK,
                            pos_nms_topk=NMS_POSK)
    for j in range(len(result)):
        result_j = result[j]
        img_name_j = img_name[j]
        total_results.append([img_name_j, result_j.tolist()])
    print('processed {} pictures'.format(len(total_results)))
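The threshold constants used above also need values; the ones below are assumptions chosen as plausible defaults for this pipeline, not something this chapter prescribes:

VALID_THRESH = 0.01  # drop predictions scoring below this before NMS
NMS_THRESH = 0.45    # IoU threshold NMS uses to suppress overlapping boxes
NMS_TOPK = 400       # keep at most this many boxes per class before NMS
NMS_POSK = 100       # keep at most this many boxes per image after NMS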

We read the data and labels from the test set and call the get_pred method, which internally uses PaddlePaddle's yolo_box operator to return a three-dimensional tensor of shape [N, M, 4] with box coordinates and one of shape [N, M, class_num] with scores. Finally, multiclass_nms keeps the best boxes, giving us, for each image, its name together with the positions and classes of the insects found in it.
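A hedged sketch of what get_pred might look like inside (the method body is an assumption; only fluid.layers.yolo_box, the anchor masks, and the downsample ratios come from this chapter):

def get_pred(self, outputs, im_shape=None, anchors=None,
             anchor_masks=None, valid_thresh=0.01):
    downsample = 32
    all_boxes, all_scores = [], []
    for i, out in enumerate(outputs):
        # pick out the (w, h) pairs of the anchors assigned to this level
        anchors_this_level = []
        for m in anchor_masks[i]:
            anchors_this_level.extend(anchors[2 * m: 2 * m + 2])
        boxes, scores = fluid.layers.yolo_box(
            x=out,
            img_size=im_shape,
            anchors=anchors_this_level,
            class_num=self.num_classes,
            conf_thresh=valid_thresh,
            downsample_ratio=downsample)
        all_boxes.append(boxes)    # [N, M_i, 4]
        all_scores.append(scores)  # [N, M_i, class_num]
        downsample = downsample // 2
    # merge the three levels along the box dimension
    return (fluid.layers.concat(all_boxes, axis=1),
            fluid.layers.concat(all_scores, axis=1))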

Model Evaluation: mAP

We usually measure a detection model's quality with mAP (mean average precision). So what exactly is mAP?

  • True Positive (TP): detections whose IoU with a ground-truth box exceeds the threshold (usually 0.5)
  • False Positive (FP): detections whose IoU with every ground-truth box is below the threshold (or duplicate detections of an already-matched box)
  • False Negative (FN): ground-truth boxes (GTB) that no detection matches
  • True Negative (TN): not used in detection, since arbitrarily many boxes correctly contain no object

Precision: TP / (TP + FP)

Recall: TP / (TP + FN)

Plotting Precision against Recall (one on each axis) gives the PR curve; the area under this curve is the AP (average precision) for a single class, and averaging AP over all classes gives the mAP (mean average precision).
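The "11point" in the evaluation output below refers to the classic 11-point interpolation of AP. A minimal sketch of that computation (precision and recall are assumed to be numpy arrays tracing the PR curve):

import numpy as np

def eleven_point_ap(precision, recall):
    # Average the best precision achievable at recall >= t,
    # for t in {0.0, 0.1, ..., 1.0}
    ap = 0.0
    for t in np.arange(0.0, 1.1, 0.1):
        candidates = precision[recall >= t]
        ap += (candidates.max() if candidates.size > 0 else 0.0) / 11.0
    return ap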

We can evaluate the mAP of our predictions on the test set directly with calculate_map.py from the AI识虫 baseline code.

!python calculate_map.py --anno_dir=./insects/val/annotations/xmls --pred_result=./pred_results.json

The result after 60 epochs of training:

Accumulating evaluatation results...
mAP(0.50, 11point) = 66.05

mAP values of models trained during hyperparameter tuning:

  • DecayedAdagradOptimizer, lr=0.001, 5 epochs: mAP 37.06
  • DecayedAdagradOptimizer, lr=0.1, 5 epochs: did not converge (PaddlePaddle raised an error)
  • DecayedAdagradOptimizer, lr=0.01, 5 epochs: mAP 43.40
  • DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 66.05
  • DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 59.45
  • Same settings with random cropping added, lr=0.01, 60 epochs: mAP 72.63
  • DecayedAdagradOptimizer, lr=0.01, 60 epochs: mAP 74.17