前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs

Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs

作者头像
AIHGF
发布2018-05-17 10:19:34
1.2K0
发布2018-05-17 10:19:34
举报
文章被收录于专栏:AIUAIAIUAIAIUAI

Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs

roi_data 包中的各个模块分别负责构建对应模型训练所需的 minibatch blobs:

  • fast_rcnn.py
  • mask_rcnn.py
  • keypoint_rcnn.py
  • rpn.py
  • retinanet.py

1. fast_rcnn.py

构建用于 Fast R-CNN 训练的 minibatches.

"""
处理 Fast R-CNN 所涉及的 minibatch blobs.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np
import numpy.random as npr

from core.config import cfg
import modeling.FPN as fpn
import roi_data.keypoint_rcnn
import roi_data.mask_rcnn
import utils.blob as blob_utils
import utils.boxes as box_utils

logger = logging.getLogger(__name__)


def get_fast_rcnn_blob_names(is_training=True):
    """
    Return the ordered list of blob names used by Fast R-CNN style models.

    The list always starts with 'rois'. When ``is_training`` is true the
    label / bbox-regression blobs are appended, followed by the mask and
    keypoint blobs if the corresponding ``cfg.MODEL`` switches are on. When
    FPN multilevel RoIs are enabled, per-level RoI blobs and their
    index-restore blobs are appended last.
    """
    # rois blob: R regions of interest, each a 5-tuple
    # (batch_idx, x1, y1, x2, y2) where batch_idx is the image's index in the
    # batch and (x1, y1, x2, y2) is the box.
    names = ['rois']

    if is_training:
        # labels_int32 blob: R categorical labels in [0, ..., K] for K
        # foreground classes plus background.
        names.append('labels_int32')
        # bbox_targets blob: R bounding-box regression targets with 4 targets
        # per class.
        names.append('bbox_targets')
        # bbox_inside_weights blob: at most 4 targets per roi are active; this
        # binary vector marks the subset of active targets.
        names.append('bbox_inside_weights')
        names.append('bbox_outside_weights')

        if cfg.MODEL.MASK_ON:
            names.extend([
                # RoIs sampled for training the mask prediction branch.
                # Shape is (#masks, 5) in format (batch_idx, x1, y1, x2, y2).
                'mask_rois',
                # Binary labels for the RoIs in 'rois' indicating whether each
                # RoI has a mask. Note: in some cases a *bg* RoI gets an
                # all -1 (ignore) mask, when no fg RoIs can be sampled.
                # Shape is (batchsize).
                'roi_has_mask_int32',
                # Binary masks for the RoIs specified in 'mask_rois'.
                # Shape is (#fg, M * M) where M is the ground truth mask size.
                'masks_int32',
            ])

        if cfg.MODEL.KEYPOINTS_ON:
            names.extend([
                # RoIs sampled for training the keypoint prediction branch.
                # Shape is (#instances, 5) in format (batch_idx, x1, y1, x2, y2).
                'keypoint_rois',
                # Index of each keypoint in a KRCNN.HEATMAP_SIZE**2 sized array.
                # Shape is (#instances). Used in SoftmaxWithLoss.
                'keypoint_locations_int32',
                # Weight of each target in 'keypoint_locations_int32'.
                # Shape is (#instances). Used in SoftmaxWithLoss.
                'keypoint_weights',
                # Optional normalization factor to use if
                # cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
                'keypoint_loss_normalizer',
            ])

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        # Support for FPN multi-level rois without bbox reg isn't implemented
        # (... and may never be implemented).
        k_max = cfg.FPN.ROI_MAX_LEVEL
        k_min = cfg.FPN.ROI_MIN_LEVEL
        levels = [str(lvl) for lvl in range(k_min, k_max + 1)]

        # Same format as the rois blob, but one per FPN level.
        names.extend('rois_fpn' + level for level in levels)
        names.append('rois_idx_restore_int32')

        if is_training:
            if cfg.MODEL.MASK_ON:
                names.extend('mask_rois_fpn' + level for level in levels)
                names.append('mask_rois_idx_restore_int32')
            if cfg.MODEL.KEYPOINTS_ON:
                names.extend('keypoint_rois_fpn' + level for level in levels)
                names.append('keypoint_rois_idx_restore_int32')

    return names


def add_fast_rcnn_blobs(blobs, im_scales, roidb):
    """
    Add the blobs needed for training Fast R-CNN style models.

    ``blobs`` is updated in place. Returns the minibatch validity flag, which
    is True unless the keypoint finalization step (run only when
    cfg.MODEL.KEYPOINTS_ON is set) reports otherwise.
    """
    # Sample training RoIs from every image and append them to the per-blob
    # lists.
    for image_index, roidb_entry in enumerate(roidb):
        sampled = _sample_rois(
            roidb_entry, im_scales[image_index], image_index)
        for name in sampled:
            blobs[name].append(sampled[name])

    # Concatenate the non-empty blob lists into tensors.
    list_blob_names = [
        name for name, value in blobs.items()
        if isinstance(value, list) and len(value) > 0
    ]
    for name in list_blob_names:
        blobs[name] = np.concatenate(blobs[name])

    # Add FPN multilevel training RoIs, if configured.
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        _add_multilevel_rois(blobs)

    # Sanity checks performed once all minibatch images have been processed.
    if not cfg.MODEL.KEYPOINTS_ON:
        return True
    return roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, True)


def _sample_rois(roidb, im_scale, batch_idx):
    """
    Generate a random sample of RoIs comprising foreground and background
    examples for a single image.

    Args:
        roidb: the roidb entry of one image (the caller passes one element of
            the roidb list). Read keys: 'max_overlaps', 'max_classes',
            'boxes', and either 'bbox_targets' or 'gt_classes' together with
            'box_to_gt_ind_map'.
        im_scale: factor the sampled boxes are multiplied by.
        batch_idx: index of the image in the minibatch; written into the
            first column of the 'rois' blob.

    Returns:
        dict with keys 'labels_int32', 'rois', 'bbox_targets',
        'bbox_inside_weights' and 'bbox_outside_weights', plus whatever the
        mask / keypoint helpers add when those branches are enabled in cfg.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with overlap >= FG_THRESH
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those with overlap within
    # [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO) )[0]
    # Compute the number of background RoIs to take from this image
    # (guarding against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg); fg come first
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        # No precomputed targets in the entry: compute them against the gt box
        # each sampled RoI is mapped to via 'box_to_gt_ind_map'.
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes, 
                                        gt_boxes[gt_assignments, :], 
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(roidb['bbox_targets'][keep_inds, :])

    # Outside weights are 1 wherever the inside weights are positive, else 0
    bbox_outside_weights = np.array(bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights )

    # Optionally add Mask R-CNN blobs (receives the unscaled sampled boxes)
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx )

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict


def _compute_targets(ex_rois, gt_rois, labels):
    """
    Compute bounding-box regression targets for an image.

    Returns a float32 array whose first column holds ``labels`` and whose
    remaining columns are the values returned by
    ``box_utils.bbox_transform_inv`` for each (ex_roi, gt_roi) pair.
    """
    # Both inputs must describe the same number of 4-coordinate boxes.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    regression_deltas = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS)
    label_column = labels[:, np.newaxis]
    labelled_targets = np.hstack((label_column, regression_deltas))
    return labelled_targets.astype(np.float32, copy=False)


def _expand_bbox_targets(bbox_target_data):
    """
    Expand compact bounding-box regression targets into the per-class layout.

    Targets are stored in the roidb in a compact form (one class label plus
    four values per row). This function expands them into the 4-of-4*K
    representation used by the network (i.e. only one class has non-zero
    targets). The loss weights are expanded in the same way.

    Args:
        bbox_target_data (ndarray): N x 5 array; column 0 is the class label,
            columns 1: are the regression targets.

    Returns:
        bbox_targets (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights

        K is cfg.MODEL.NUM_CLASSES, or 2 (bg and fg) when
        cfg.MODEL.CLS_AGNOSTIC_BBOX_REG is set.
    """
    cls_agnostic = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG
    num_bbox_reg_classes = cfg.MODEL.NUM_CLASSES
    if cls_agnostic:
        num_bbox_reg_classes = 2  # bg and fg

    clss = bbox_target_data[:, 0]
    bbox_targets = blob_utils.zeros((clss.size, 4 * num_bbox_reg_classes))
    bbox_inside_weights = blob_utils.zeros(bbox_targets.shape)
    # Only foreground rows (label > 0) receive targets and weights.
    for ind in np.where(clss > 0)[0]:
        # In class-agnostic mode there is a single foreground slot (index 1).
        # Using the real class id would point past the 2-slot output for any
        # class > 1 and the 4-value assignment below would fail.
        cls = 1 if cls_agnostic else int(clss[ind])
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0)
    return bbox_targets, bbox_inside_weights


def _add_multilevel_rois(blobs):
    """
    Distribute the training RoI blobs over the FPN levels.

    By default training RoIs are added for a single feature map level only.
    When using FPN, the RoIs must be assigned to FPN levels according to the
    level assignment heuristic
    (see: modeling.FPN.map_rois_to_fpn_levels).
    """
    min_level = cfg.FPN.ROI_MIN_LEVEL
    max_level = cfg.FPN.ROI_MAX_LEVEL

    # 'rois' is always distributed; the mask / keypoint RoI blobs only when
    # the corresponding branch is enabled.
    roi_blob_names = ['rois']
    if cfg.MODEL.MASK_ON:
        roi_blob_names.append('mask_rois')
    if cfg.MODEL.KEYPOINTS_ON:
        roi_blob_names.append('keypoint_rois')

    for blob_name in roi_blob_names:
        rois = blobs[blob_name]
        # The rois blob is in (batch_idx, x1, y1, x2, y2) format, so columns
        # 1:5 hold the box coordinates used to pick each roi's target level.
        roi_levels = fpn.map_rois_to_fpn_levels(
            rois[:, 1:5], min_level, max_level)
        # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
        fpn.add_multilevel_roi_blobs(
            blobs, blob_name, rois, roi_levels, min_level, max_level)

2. mask_rcnn.py

构建 Mask R-CNN 训练的 minibatches.

"""
处理 Mask R-CNN 的 minibatch blobs.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np

from core.config import cfg
import utils.blob as blob_utils
import utils.boxes as box_utils
import utils.segms as segm_utils

logger = logging.getLogger(__name__)


def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """
    Add Mask R-CNN specific blobs to the input blob dictionary.

    Prepares the mask targets by associating one gt mask with each training
    roi that has a foreground (non-bg) class label.

    Args:
        blobs: blob dictionary for one image; 'labels_int32' is read and
            'mask_rois', 'roi_has_mask_int32' and 'masks_int32' are written.
        sampled_boxes: boxes of the sampled rois (unscaled), row-aligned with
            blobs['labels_int32']. Not modified by this function.
        roidb: roidb entry of the image; 'gt_classes', 'is_crowd' and 'segms'
            are read.
        im_scale: factor the mask rois are multiplied by.
        batch_idx: index of the image in the minibatch; written into the
            first column of 'mask_rois'.
    """
    M = cfg.MRCNN.RESOLUTION
    # Non-crowd ground-truth instances and the boxes enclosing their polygons.
    polys_gt_inds = np.where((roidb['gt_classes'] > 0)
                             & (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    # Binary flag per sampled roi: 1 for foreground rois, 0 otherwise.
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))

        # Map each fg roi to the mask with the highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # Add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]

            # Rasterize the polygon mask within the given fg roi to an M x M
            # binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            # Ensure the mask is binary
            mask = np.array(mask > 0, dtype=np.int32)
            masks[i, :] = np.reshape(mask, M**2)
    else:  # If there are no fg masks
        # The network cannot handle empty blobs, so we must provide a mask.
        # We simply take the first bg roi, give it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually a background roi here. Indexing with a scalar
        # and reshaping yields a view of sampled_boxes, so take a copy;
        # otherwise the in-place scaling below would modify the caller's
        # array.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # Mark that the first roi has a mask
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2). rois_fg is a
    # private copy in both branches, so in-place scaling is safe and keeps
    # its dtype.
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks


def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """
    Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2)
    to encode class specific mask targets.

    Each row's mask is written into the slot of its class; every other entry
    stays at -1.
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    mask_area = cfg.MRCNN.RESOLUTION ** 2
    num_masks = masks.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    expanded = -blob_utils.ones(
        (num_masks, cfg.MODEL.NUM_CLASSES * mask_area), int32=True)

    for row in range(num_masks):
        class_id = int(mask_class_labels[row])
        # Ignore background instances
        # (only happens when there is no fg sample in an image)
        if class_id <= 0:
            continue
        offset = mask_area * class_id
        expanded[row, offset:offset + mask_area] = masks[row, :]

    return expanded

3. keypoint_rcnn.py

构建 Mask R-CNN keypoint 分支训练所用的 minibatches.

"""
处理 Mask R-CNN 中关于 keypoint 检测分支训练的 minibatch blobs.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np

from core.config import cfg
import utils.blob as blob_utils
import utils.keypoints as keypoint_utils

logger = logging.getLogger(__name__)


def add_keypoint_rcnn_blobs(blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx):
    """
    Add Mask R-CNN keypoint specific blobs to the given blobs dictionary.

    Writes 'keypoint_rois', 'keypoint_locations_int32' and 'keypoint_weights'
    into ``blobs``. ``fg_inds`` is accepted but not read here.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']
    all_boxes = roidb['boxes']

    # Keypoints of the gt instance that every box is mapped to.
    keypoints_per_box = gt_keypoints[gt_inds[roidb['box_to_gt_ind_map']], :, :]
    inside_box = _within_box(keypoints_per_box, all_boxes)
    marked_visible = keypoints_per_box[:, 2, :] > 0
    # A box qualifies if at least one keypoint is both visible and inside it.
    has_usable_keypoint = np.sum(
        np.logical_and(marked_visible, inside_box), axis=1) > 0
    is_foreground = overlaps >= cfg.TRAIN.FG_THRESH
    kp_fg_inds = np.where(
        np.logical_and(is_foreground, has_usable_keypoint))[0]

    # Subsample (without replacement) when there are more candidates than
    # fg_rois_per_image.
    sample_count = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > sample_count:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=sample_count, replace=False)

    sampled_fg_rois = all_boxes[kp_fg_inds]
    sampled_gt_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    # Rows default to -1 and are filled from the mapped gt instance.
    keypoint_shape = (
        len(sampled_fg_rois), gt_keypoints.shape[1], gt_keypoints.shape[2])
    sampled_keypoints = -np.ones(keypoint_shape, dtype=gt_keypoints.dtype)
    for row in range(len(sampled_fg_rois)):
        gt_slot = sampled_gt_map[row]
        if gt_slot >= 0:
            sampled_keypoints[row, :, :] = gt_keypoints[gt_inds[gt_slot], :, :]
            assert np.sum(sampled_keypoints[row, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
        sampled_keypoints, sampled_fg_rois)

    # Flatten to one row per (roi, keypoint) pair.
    flat_shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(flat_shape)
    weights = weights.reshape(flat_shape)

    # Scale the rois and format them as (batch_idx, x1, y1, x2, y2).
    sampled_fg_rois *= im_scale
    batch_column = batch_idx * blob_utils.ones((sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((batch_column, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights


def finalize_keypoint_minibatch(blobs, valid):
    """
    Finalize the minibatch after blobs for all minibatch images have been
    collated.

    Stores 'keypoint_loss_normalizer' in ``blobs`` and returns the updated
    validity flag: the minibatch stays valid only if ``valid`` was already
    true, there is at least one keypoint weight, and the summed weights
    exceed cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH.
    """
    min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH
    keypoint_weights = blobs['keypoint_weights']
    visible_total = np.sum(keypoint_weights)
    valid = (
        valid and len(keypoint_weights) > 0 and visible_total > min_count)

    # Normalizer to use if cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False.
    # See modeling.model_builder.add_keypoint_losses
    denominator = (
        cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM *
        cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS
    )
    blobs['keypoint_loss_normalizer'] = np.array(
        visible_total / denominator, dtype=np.float32)
    return valid


def _within_box(points, boxes):
    """
    确认在给定 box 中的 keypoints.

    points: Nx2xK
    boxes: Nx4
    output: NxK
    """
    x_within = np.logical_and(
        points[:, 0, :] >= np.expand_dims(boxes[:, 0], axis=1),
        points[:, 0, :] <= np.expand_dims(boxes[:, 2], axis=1) )
    y_within = np.logical_and(
        points[:, 1, :] >= np.expand_dims(boxes[:, 1], axis=1),
        points[:, 1, :] <= np.expand_dims(boxes[:, 3], axis=1) )
    return np.logical_and(x_within, y_within)

4. rpn.py

"""
RPN - Region Proposal Networks 构建 minibatch.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import numpy as np
import numpy.random as npr

from core.config import cfg
import roi_data.data_utils as data_utils
import utils.blob as blob_utils
import utils.boxes as box_utils

logger = logging.getLogger(__name__)


def get_rpn_blob_names(is_training=True):
    """
    Return the blob names used by RPN.

    'im_info' is always present. The 'roidb' blob and the RPN target blobs
    (one set per FPN level when multilevel RPN is enabled) are added only
    when ``is_training`` is true.
    """
    # im_info: (height, width, image scale)
    names = ['im_info']
    if not is_training:
        return names

    # gt boxes: (batch_idx, x1, y1, x2, y2, cls)
    names.append('roidb')

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        # Same format as the RPN blobs, but one per FPN level
        per_level_prefixes = (
            'rpn_labels_int32_wide_fpn',
            'rpn_bbox_targets_wide_fpn',
            'rpn_bbox_inside_weights_wide_fpn',
            'rpn_bbox_outside_weights_wide_fpn',
        )
        for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
            level_tag = str(lvl)
            names.extend(prefix + level_tag for prefix in per_level_prefixes)
    else:
        # Single level RPN blobs
        names.extend([
            'rpn_labels_int32_wide',
            'rpn_bbox_targets_wide',
            'rpn_bbox_inside_weights_wide',
            'rpn_bbox_outside_weights_wide',
        ])
    return names


def add_rpn_blobs(blobs, im_scales, roidb):
    """
    Add blobs needed for training RPN-only and end-to-end Faster R-CNN models.

    Args:
        blobs: dict of blob lists, updated in place; list-valued entries are
            concatenated into arrays and 'roidb' is set to a serialized,
            reduced copy of ``roidb``.
        im_scales: per-image scale factors, indexed like ``roidb``.
        roidb: list of roidb entries for the minibatch images.

    Returns:
        Always True (RPN minibatches are valid by design).
    """
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        # RPN applied to many feature levels, as in the FPN paper
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        foas = []
        for lvl in range(k_min, k_max + 1):
            # Stride and anchor size both double with each level
            field_stride = 2.**lvl
            anchor_sizes = (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), )
            anchor_aspect_ratios = cfg.FPN.RPN_ASPECT_RATIOS
            foa = data_utils.get_field_of_anchors(field_stride, anchor_sizes, anchor_aspect_ratios)
            foas.append(foa)
        all_anchors = np.concatenate([f.field_of_anchors for f in foas])
    else:
        # Single-level RPN; this `foa` is reused in the per-image loop below
        foa = data_utils.get_field_of_anchors(cfg.RPN.STRIDE, 
                                              cfg.RPN.SIZES, 
                                              cfg.RPN.ASPECT_RATIOS )
        all_anchors = foa.field_of_anchors

    for im_i, entry in enumerate(roidb):
        scale = im_scales[im_i]
        im_height = np.round(entry['height'] * scale)
        im_width = np.round(entry['width'] * scale)
        # Non-crowd ground-truth boxes, scaled to the network input size
        gt_inds = np.where( (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0) )[0]
        gt_rois = entry['boxes'][gt_inds, :] * scale
        # TODO(rbg): gt_boxes is poorly named;
        # should be something like 'gt_rois_info'
        # NOTE(review): gt_boxes is built here but not read again in this
        # function -- looks like dead code; confirm before removing.
        gt_boxes = blob_utils.zeros((len(gt_inds), 6))
        gt_boxes[:, 0] = im_i  # batch inds
        gt_boxes[:, 1:5] = gt_rois
        gt_boxes[:, 5] = entry['gt_classes'][gt_inds]
        im_info = np.array([[im_height, im_width, scale]], dtype=np.float32)
        blobs['im_info'].append(im_info)

        # Add RPN targets
        if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
            # RPN applied to many feature levels, as in the FPN paper
            rpn_blobs = _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_rois)
            for i, lvl in enumerate(range(k_min, k_max + 1)):
                for k, v in rpn_blobs[i].items():
                    blobs[k + '_fpn' + str(lvl)].append(v)
        else:
            # Classical RPN, applied to a single feature level
            rpn_blobs = _get_rpn_blobs(im_height, im_width, [foa], all_anchors, gt_rois)
            for k, v in rpn_blobs.items():
                blobs[k].append(v)

    # Concatenate the non-empty blob lists into arrays
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)

    # Keep only this subset of keys from each roidb entry in the 'roidb' blob
    valid_keys = ['has_visible_keypoints', 'boxes', 'segms', 'seg_areas', 'gt_classes',
                  'gt_overlaps', 'is_crowd', 'box_to_gt_ind_map', 'gt_keypoints' ]
    minimal_roidb = [{} for _ in range(len(roidb))]
    for i, e in enumerate(roidb):
        for k in valid_keys:
            if k in e:
                minimal_roidb[i][k] = e[k]
    blobs['roidb'] = blob_utils.serialize(minimal_roidb)

    # Always return valid=True, since RPN minibatches are valid by design
    return True


def _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_boxes):
    """
    Compute RPN labels, bbox regression targets and loss weights for one image.

    Args:
        im_height, im_width: size of the (scaled) image; used to discard
            anchors that straddle the image boundary.
        foas: list of field-of-anchors objects; ``field_size`` and
            ``num_cell_anchors`` are read to split the results per field.
        all_anchors: array of all anchors (rows are x1, y1, x2, y2),
            concatenated in the order of ``foas``.
        gt_boxes: ground-truth boxes of the image; may be empty, in which
            case no anchor is labeled positive.

    Returns:
        A dict with keys 'rpn_labels_int32_wide', 'rpn_bbox_targets_wide',
        'rpn_bbox_inside_weights_wide' and 'rpn_bbox_outside_weights_wide'
        when ``foas`` has exactly one element, otherwise a list of such
        dicts, one per element of ``foas``.
    """
    total_anchors = all_anchors.shape[0]
    straddle_thresh = cfg.TRAIN.RPN_STRADDLE_THRESH

    if straddle_thresh >= 0:
        # Only keep anchors inside the image by a margin of straddle_thresh.
        # Set TRAIN.RPN_STRADDLE_THRESH to -1 (or a large value) to keep all
        # anchors.
        inds_inside = np.where((all_anchors[:, 0] >= -straddle_thresh) &
                               (all_anchors[:, 1] >= -straddle_thresh) &
                               (all_anchors[:, 2] < im_width + straddle_thresh) &
                               (all_anchors[:, 3] < im_height + straddle_thresh))[0]
        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]
    else:
        inds_inside = np.arange(all_anchors.shape[0])
        anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label=1 is positive, 0 is negative, -1 is don't care (ignore)
    labels = np.empty((num_inside, ), dtype=np.int32)
    labels.fill(-1)
    # Defaults used when the image has no gt boxes: every anchor has zero
    # overlap with ground truth. Without these, the background sampling and
    # target computation below would hit unbound names.
    anchor_to_gt_argmax = np.zeros((num_inside, ), dtype=np.int64)
    anchor_to_gt_max = np.zeros((num_inside, ), dtype=np.float32)
    if len(gt_boxes) > 0:
        # Compute overlaps between the anchors and the gt boxes
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to the gt box that has the highest overlap
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with its most overlapping gt box
        anchor_to_gt_max = anchor_by_gt_overlap[np.arange(num_inside),
                                                anchor_to_gt_argmax]

        # Map from gt box to the anchor that has the highest overlap
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with its most overlapping anchor
        gt_to_anchor_max = anchor_by_gt_overlap[
            gt_to_anchor_argmax,
            np.arange(anchor_by_gt_overlap.shape[1])]
        # Find all anchors that share the max overlap amount
        # (this includes many ties)
        anchors_with_max_overlap = np.where(
            anchor_by_gt_overlap == gt_to_anchor_max)[0]

        # Fg label: for each gt use anchors with highest overlap
        # (including ties)
        labels[anchors_with_max_overlap] = 1
        # Fg label: above the IoU threshold
        labels[anchor_to_gt_max >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    # Subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCH_SIZE_PER_IM)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1
    fg_inds = np.where(labels == 1)[0]

    # Subsample negative labels if we have too many
    # (samples with replacement, but since the set of bg inds is large most
    # samples will not have repeats)
    num_bg = cfg.TRAIN.RPN_BATCH_SIZE_PER_IM - np.sum(labels == 1)
    bg_inds = np.where(anchor_to_gt_max < cfg.TRAIN.RPN_NEGATIVE_OVERLAP)[0]
    if len(bg_inds) > num_bg:
        enable_inds = bg_inds[npr.randint(len(bg_inds), size=num_bg)]
        labels[enable_inds] = 0
    bg_inds = np.where(labels == 0)[0]

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    # Nothing to regress when there are no foreground anchors (in particular
    # when gt_boxes is empty).
    if len(fg_inds) > 0:
        bbox_targets[fg_inds, :] = data_utils.compute_targets(
            anchors[fg_inds, :],
            gt_boxes[anchor_to_gt_argmax[fg_inds], :])

    # Bbox regression loss has the form:
    #   loss(x) = weight_outside * L(weight_inside * x)
    # Inside weights allow us to set zero loss on an element-wise basis.
    # Bbox regression is only trained on positive examples, so their weight
    # is set to 1.0 and to 0.0 otherwise. Inside weights act as a "switch".
    bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = (1.0, 1.0, 1.0, 1.0)

    # The bbox regression loss only averages by the number of images in the
    # minibatch, whereas we need to average by the total number of selected
    # example anchors. Outside weights scale each element-wise loss so that
    # the final average over the minibatch is correct. Outside weights act
    # as the "weights".
    bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32)
    # uniform weighting of examples (given non-uniform sampling)
    num_examples = np.sum(labels >= 0)
    # With no selected examples there is nothing to weight; skipping also
    # avoids dividing by zero.
    if num_examples > 0:
        bbox_outside_weights[labels == 1, :] = 1.0 / num_examples
        bbox_outside_weights[labels == 0, :] = 1.0 / num_examples

    # Map up to original set of anchors
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = data_utils.unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = data_utils.unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # Split the generated labels, etc. into labels per each field of anchors
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = foa.field_size
        W = foa.field_size
        A = foa.num_cell_anchors
        end_idx = start_idx + H * W * A
        _labels = labels[start_idx:end_idx]
        _bbox_targets = bbox_targets[start_idx:end_idx, :]
        _bbox_inside_weights = bbox_inside_weights[start_idx:end_idx, :]
        _bbox_outside_weights = bbox_outside_weights[start_idx:end_idx, :]
        start_idx = end_idx

        # labels output with shape (1, A, height, width)
        _labels = _labels.reshape((1, H, W, A)).transpose(0, 3, 1, 2)
        # bbox_targets output with shape (1, 4 * A, height, width)
        _bbox_targets = _bbox_targets.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_inside_weights output with shape (1, 4 * A, height, width)
        _bbox_inside_weights = _bbox_inside_weights.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        # bbox_outside_weights output with shape (1, 4 * A, height, width)
        _bbox_outside_weights = _bbox_outside_weights.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2)
        blobs_out.append(dict(rpn_labels_int32_wide=_labels,
                              rpn_bbox_targets_wide=_bbox_targets,
                              rpn_bbox_inside_weights_wide=_bbox_inside_weights,
                              rpn_bbox_outside_weights_wide=_bbox_outside_weights))

    return blobs_out[0] if len(blobs_out) == 1 else blobs_out

5. retinanet.py

"""
计算训练 RetinaNet 网络的 minibatch blobs.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import logging

import utils.boxes as box_utils
import roi_data.data_utils as data_utils
from core.config import cfg


logger = logging.getLogger(__name__)


def get_retinanet_blob_names(is_training=True):
    """
    Return the blob names in the order in which they are read by the data
    loader.

    N = number of images per minibatch
    A = number of anchors = num_scales * num_aspect_ratios
        (for example 9 used in RetinaNet paper)
    H, W = spatial dimensions (different for each FPN level)
    M = Out of all the anchors generated, depending on the positive/negative
        IoU overlap thresholds, we will have M positive anchors. These are
        the anchors that the bounding box branch will regress on.

    retnet_cls_labels -> labels for the cls branch for each FPN level
                         Shape: N x A x H x W

    retnet_roi_bbox_targets -> targets for the bbox regression branch
                               Shape: M x 4

    retnet_roi_fg_bbox_locs -> for the bbox regression, since we are only
                               interested in regressing on fg bboxes and the
                               network prediction has shape
                               N x (A * 4) x H x W, the locations of the
                               positive boxes are stored in this blob.
                               Shape: M x 4, where each row is
                               [img_id, anchor_id, x_loc, y_loc]
    """
    # im_info: (height, width, image scale)
    names = ['im_info']
    assert cfg.FPN.FPN_ON, "RetinaNet uses FPN for dense detection"
    if not is_training:
        return names

    # Same format as RPN blobs, but one per FPN level
    names.extend(['retnet_fg_num', 'retnet_bg_num'])
    for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
        suffix = 'fpn{}'.format(lvl)
        names.append('retnet_cls_labels_' + suffix)
        names.append('retnet_roi_bbox_targets_' + suffix)
        names.append('retnet_roi_fg_bbox_locs_' + suffix)
    return names


def add_retinanet_blobs(blobs, im_scales, roidb, image_width, image_height):
    """
    Fill `blobs` in place with the RetinaNet training blobs of a minibatch.

    Args:
        blobs: dict of blob name -> value. 'im_info' and every per-level
            '<name>_fpn<level>' entry are appended to, so they are expected
            to already hold lists. 'retnet_fg_num' / 'retnet_bg_num' are
            (re)initialised here.
        im_scales: per-image scale factors, indexed like `roidb`.
        roidb: sequence of entries; this function reads the keys 'height',
            'width', 'gt_classes', 'is_crowd' and 'boxes' of each entry.
        image_width, image_height: forwarded unchanged to
            `_get_retinanet_blobs`.

    Returns:
        Always True.
    """
    # RetinaNet is applied to many feature levels, as in the FPN paper.
    min_level, max_level = cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL
    octaves_per_level = cfg.RETINANET.SCALES_PER_OCTAVE
    ratios = cfg.RETINANET.ASPECT_RATIOS
    n_ratios = len(ratios)
    base_scale = cfg.RETINANET.ANCHOR_SCALE

    # One field of anchors per (level, octave, aspect ratio) combination.
    foas = []
    for level in range(min_level, max_level + 1):
        stride = 2. ** level
        for octave in range(octaves_per_level):
            octave_scale = 2 ** (octave / float(octaves_per_level))
            for ratio_idx, ratio in enumerate(ratios):
                foas.append(data_utils.get_field_of_anchors(
                    stride,
                    (stride * octave_scale * base_scale, ),
                    (ratio, ),
                    octave,
                    ratio_idx))
    all_anchors = np.concatenate([foa.field_of_anchors for foa in foas])

    # Number of bbox-prediction channels per anchor:
    # 4 coordinates, times the number of fg classes when class specific.
    loc_stride = 4
    if cfg.RETINANET.CLASS_SPECIFIC_BBOX:
        loc_stride *= (cfg.MODEL.NUM_CLASSES - 1)

    blobs['retnet_fg_num'] = 0.0
    blobs['retnet_bg_num'] = 0.0
    for img_idx, entry in enumerate(roidb):
        scale = im_scales[img_idx]
        scaled_height = np.round(entry['height'] * scale)
        scaled_width = np.round(entry['width'] * scale)

        # Keep only non-crowd ground truth with a foreground class.
        usable_gt = (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0)
        gt_inds = np.where(usable_gt)[0]
        assert len(gt_inds) > 0, \
            'Empty ground truth empty for image is not allowed. Please check.'

        gt_rois = entry['boxes'][gt_inds, :] * scale
        gt_classes = entry['gt_classes'][gt_inds]

        blobs['im_info'].append(
            np.array([[scaled_height, scaled_width, scale]],
                     dtype=np.float32))

        per_foa_blobs, fg_num, bg_num = _get_retinanet_blobs(
            foas, all_anchors, gt_rois, gt_classes, image_width, image_height)

        # The lists are stacked as
        # [[anchors for image 1] + [anchors for image 2] + ...]
        for foa, foa_blobs in zip(foas, per_foa_blobs):
            fpn_level = int(np.log2(foa.stride))
            for name, value in foa_blobs.items():
                if name == 'retnet_roi_fg_bbox_locs':
                    value[:, 0] = img_idx
                    anchor_ind = foa.octave * n_ratios + foa.aspect
                    # Column 1 holds the class index written by
                    # _get_retinanet_blobs (0 unless class-specific bbox is
                    # enabled). Turn it into an offset: class index * 4,
                    # shifted by this anchor's block of loc_stride entries.
                    value[:, 1] *= 4
                    value[:, 1] += loc_stride * anchor_ind
                blobs['{}_fpn{}'.format(name, fpn_level)].append(value)
        blobs['retnet_fg_num'] += fg_num
        blobs['retnet_bg_num'] += bg_num

    for count_name in ('retnet_fg_num', 'retnet_bg_num'):
        blobs[count_name] = blobs[count_name].astype(np.float32)

    num_images = len(roidb)
    for name, items in blobs.items():
        if not (isinstance(items, list) and len(items) > 0):
            continue
        # Number of list elements contributed by each image.
        per_image = int(len(items) / num_images)
        if 'retnet_cls_labels' in name:
            # Each element is one anchor's label map for one image, ordered
            # [[a0, ..., aK] for image 0, [a0, ..., aK] for image 1, ...].
            # Join the anchors of each image along axis 1 first, then stack
            # the images along axis 0.
            joined_per_image = [
                np.concatenate(items[start: start + per_image], axis=1)
                for start in range(0, len(items), per_image)
            ]
            blobs[name] = np.concatenate(joined_per_image, axis=0)
        else:
            # Everything else (bbox targets, fg locations, im_info) is a
            # list of row blocks that are simply stacked along axis 0.
            blobs[name] = np.concatenate(items, axis=0)
    return True


def _get_retinanet_blobs(
        foas, all_anchors, gt_boxes, gt_classes, im_width, im_height):
    """
    Build the per-field-of-anchors RetinaNet blobs for a single image.

    Args:
        foas: sequence of field-of-anchors objects; `field_size` and `stride`
            are read from each. Each one is assumed to own a contiguous run
            of `field_size * field_size` rows of `all_anchors`, in order.
        all_anchors: array of anchors (first axis = anchor) that is passed to
            `box_utils.bbox_overlaps` together with `gt_boxes`.
        gt_boxes: ground-truth boxes for the image; may be empty, in which
            case no anchor is foreground.
        gt_classes: class label of each ground-truth box.
        im_width, im_height: used, divided by each stride, to crop the label
            maps.

    Returns:
        (blobs_out, out_num_fg, out_num_bg) where
          blobs_out  -- list with one dict per entry of `foas`:
              'retnet_cls_labels'       int32, (1, 1, H, W) cropped to [0:h, 0:w]
              'retnet_roi_bbox_targets' float32, M x 4
              'retnet_roi_fg_bbox_locs' float32, M x 4 rows of
                                        [0, class_index, y, x]
          out_num_fg -- np.float32 array [num_fg + 1.0]
          out_num_bg -- array derived from num_bg, out_num_fg and
                        cfg.MODEL.NUM_CLASSES (see the end of the function).
    """
    total_anchors = all_anchors.shape[0]
    logger.debug('Getting mad blobs: im_height {} im_width: {}'.format(
        im_height, im_width))

    # Every anchor is kept (no "inside the image" filtering here).
    inds_inside = np.arange(all_anchors.shape[0])
    anchors = all_anchors
    num_inside = len(inds_inside)

    logger.debug('total_anchors: {}'.format(total_anchors))
    logger.debug('inds_inside: {}'.format(num_inside))
    logger.debug('anchors.shape: {}'.format(anchors.shape))

    # Compute anchor labels:
    # label >= 1 is a foreground class, 0 is negative, -1 is don't care
    labels = np.empty((num_inside, ), dtype=np.float32)
    labels.fill(-1)

    # Defaults for the no-ground-truth case: zero overlap everywhere, so the
    # code below never hits an unbound name when gt_boxes is empty.
    anchor_to_gt_max = np.zeros((num_inside, ), dtype=np.float32)
    anchor_to_gt_argmax = np.zeros((num_inside, ), dtype=np.int64)

    if len(gt_boxes) > 0:
        # Compute overlaps between the anchors and the gt boxes overlaps
        anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes)
        # Map from anchor to gt box that has highest overlap
        anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
        # For each anchor, amount of overlap with most overlapping gt box
        anchor_to_gt_max = anchor_by_gt_overlap[
            np.arange(num_inside), anchor_to_gt_argmax]

        # Map from gt box to an anchor that has highest overlap
        gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
        # For each gt box, amount of overlap with most overlapping anchor
        gt_to_anchor_max = anchor_by_gt_overlap[
            gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])]
        # Find all anchors that share the max overlap amount
        # (this includes many ties)
        anchors_with_max_overlap = np.where(
            anchor_by_gt_overlap == gt_to_anchor_max)[0]

        # Fg label: for each gt use anchors with highest overlap
        # (including ties)
        gt_inds = anchor_to_gt_argmax[anchors_with_max_overlap]
        labels[anchors_with_max_overlap] = gt_classes[gt_inds]
        # Fg label: above threshold IOU
        inds = anchor_to_gt_max >= cfg.RETINANET.POSITIVE_OVERLAP
        gt_inds = anchor_to_gt_argmax[inds]
        labels[inds] = gt_classes[gt_inds]

    # NOTE(review): fg_inds is taken before the background labels are
    # written, so an anchor that is both a best match and below the negative
    # threshold is counted in num_fg yet ends up labelled 0. Kept as is.
    fg_inds = np.where(labels >= 1)[0]
    bg_inds = np.where(anchor_to_gt_max < cfg.RETINANET.NEGATIVE_OVERLAP)[0]
    labels[bg_inds] = 0
    num_fg, num_bg = len(fg_inds), len(bg_inds)

    bbox_targets = np.zeros((num_inside, 4), dtype=np.float32)
    if len(fg_inds) > 0:
        # Skipped when there is no foreground so that an empty gt_boxes is
        # never indexed.
        bbox_targets[fg_inds, :] = data_utils.compute_targets(
            anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :])

    # Map up to original set of anchors
    labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = data_utils.unmap(
        bbox_targets, total_anchors, inds_inside, fill=0)

    # Number of foreground classes (background excluded).
    num_classes = cfg.MODEL.NUM_CLASSES - 1

    # Split the generated labels, etc. into labels per each field of anchors
    blobs_out = []
    start_idx = 0
    for foa in foas:
        H = foa.field_size
        W = foa.field_size
        end_idx = start_idx + H * W
        _labels = labels[start_idx:end_idx]
        _bbox_targets = bbox_targets[start_idx:end_idx, :]
        start_idx = end_idx

        # labels reshaped to (1, 1, H, W)
        _labels = _labels.reshape((1, 1, H, W))
        # bbox_targets reshaped to (1, 4, H, W)
        _bbox_targets = _bbox_targets.reshape(
            (1, H, W, 4)).transpose(0, 3, 1, 2)
        stride = foa.stride
        w = int(im_width / stride)
        h = int(im_height / stride)

        # data for select_smooth_l1 loss
        # np.where on a 4-D array returns a 4-tuple of index arrays; the
        # number of foreground locations is the length of any one of them.
        inds_4d = np.where(_labels > 0)
        M = len(inds_4d[0])
        _roi_bbox_targets = np.zeros((0, 4))
        _roi_fg_bbox_locs = np.zeros((0, 4))
        if M > 0:
            im_inds, y, x = inds_4d[0], inds_4d[2], inds_4d[3]
            _roi_bbox_targets = np.zeros((M, 4))
            _roi_fg_bbox_locs = np.zeros((M, 4))
            lbls = _labels[im_inds, :, y, x]
            for i, lbl in enumerate(lbls):
                # Zero-based class index; forced to 0 when the bbox branch
                # is class agnostic.
                cls_idx = lbl[0] - 1
                if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
                    cls_idx = 0
                assert cls_idx >= 0 and cls_idx < num_classes, \
                    'label out of the range'
                _roi_bbox_targets[i, :] = _bbox_targets[:, :, y[i], x[i]]
                # Column 0 (image index) is filled in by the caller.
                _roi_fg_bbox_locs[i, :] = np.array(
                    [[0, cls_idx, y[i], x[i]]])
        blobs_out.append(dict(
            retnet_cls_labels=_labels[:, :, 0:h, 0:w].astype(np.int32),
            retnet_roi_bbox_targets=_roi_bbox_targets.astype(np.float32),
            retnet_roi_fg_bbox_locs=_roi_fg_bbox_locs.astype(np.float32),
        ))
    out_num_fg = np.array([num_fg + 1.0], dtype=np.float32)
    out_num_bg = (np.array([num_bg + 1.0]) * (cfg.MODEL.NUM_CLASSES - 1) +
                  out_num_fg * (cfg.MODEL.NUM_CLASSES - 2))

    return blobs_out, out_num_fg, out_num_bg
本文参与 腾讯云自媒体分享计划,分享自作者个人站点/博客。
原始发表:2018年04月09日,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs
    • 1. fast_rcnn.py
      • 2. mask_rcnn.py
        • 3. keypoint_rcnn.py
          • 4. rpn.py
            • 5. retinanet.py
            领券
            问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档