Modifying detectron2 model configuration files


1. cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml

_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: True
  WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
  RESNETS:
# ResNeXt group configuration
    STRIDE_IN_1X1: False  # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
# ResNet depth
    DEPTH: 152
# Enable deformable convolution per stage (res2-res5)
    DEFORM_ON_PER_STAGE: [False, True, True, True]
  ROI_HEADS:
    NAME: "CascadeROIHeads"
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_CONV: 4
    NUM_FC: 1
# Use GroupNorm (GN) in FastRCNNConvFCHead
    NORM: "GN"
    CLS_AGNOSTIC_BBOX_REG: True
  ROI_MASK_HEAD:
    NUM_CONV: 8
# Use GroupNorm (GN) in the mask head
    NORM: "GN"
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  IMS_PER_BATCH: 128
  STEPS: (35000, 45000)
  MAX_ITER: 50000
  BASE_LR: 0.16
INPUT:
  MIN_SIZE_TRAIN: (640, 864)
# "range": resize the short side to a random size within (640, 864) while keeping
# the long side no larger than MAX_SIZE_TRAIN; if the long side would exceed it,
# the whole image is scaled down proportionally, so the short side can end up
# below the sampled value. "choice": randomly pick one value from (640, 864).
# (See the sketch after this file.)
  MIN_SIZE_TRAIN_SAMPLING: "range"
  MAX_SIZE_TRAIN: 1440
  CROP:
    ENABLED: True
TEST:
  EVAL_PERIOD: 2500
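
A quick way to see how these INPUT keys behave is detectron2's resize augmentation, which is what they configure. Below is a minimal sketch using T.ResizeShortestEdge; the image shape is a made-up example:

import numpy as np
from detectron2.data import transforms as T

# "range": the short-edge target is sampled uniformly from [640, 864];
# "choice" would pick one value out of the tuple instead. max_size caps
# the long edge, shrinking the whole image proportionally if needed.
aug = T.ResizeShortestEdge(
    short_edge_length=(640, 864), max_size=1440, sample_style="range"
)

image = np.zeros((500, 2000, 3), dtype=np.uint8)  # very wide dummy image
resized = aug.get_transform(image).apply_image(image)
# At any sampled target the long edge would exceed 1440, so both sides are
# scaled down until max(h, w) == 1440; the short edge ends up below 640.
print(resized.shape)  # (360, 1440, 3)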

2. Base-RCNN-FPN.yaml

MODEL:
# Overall meta-architecture
  META_ARCHITECTURE: "GeneralizedRCNN"
  # Backbone choice
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
  # Anchor sizes and aspect ratios
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    # Number of RPN proposals kept before/after NMS
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
# Overall ROI heads; IN_FEATURES is shared by the box and mask heads.
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
# Box sub-head of the ROI heads
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
# Mask sub-head of the ROI heads
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
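
Because the cascade config in section 1 points at this file through _BASE_, detectron2 merges the two at load time: this base is applied over the defaults first, then the child file's keys override it. A minimal sketch of loading the merged result and overriding a few keys for a custom dataset; the dataset names and class count are hypothetical placeholders:

from detectron2.config import get_cfg

cfg = get_cfg()  # starts from the defaults shown in section 3
# merge_from_file resolves _BASE_ recursively, so Base-RCNN-FPN.yaml is
# merged first and the cascade file's keys win on conflict.
cfg.merge_from_file(
    "configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml"
)

cfg.DATASETS.TRAIN = ("my_dataset_train",)  # hypothetical dataset names
cfg.DATASETS.TEST = ("my_dataset_val",)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3         # foreground classes only
cfg.SOLVER.IMS_PER_BATCH = 2                # fit a single GPU
print(cfg.dump())                           # the fully resolved YAML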

3. defaults.py

Some of the commonly used configuration parameters:

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .config import CfgNode as CN

_C = CN()

# The version number, to upgrade from old configs to new ones if any
# changes happen. It's recommended to keep a VERSION in your config file.
_C.VERSION = 2

_C.MODEL = CN()
# Whether the mask branch is enabled
_C.MODEL.MASK_ON = False

# Path to the weights to load
_C.MODEL.WEIGHTS = ""

# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the smallest side of the image during training
_C.INPUT.MIN_SIZE_TRAIN = (800,)
# Sample size of smallest side by choice or random selection from range given by
# INPUT.MIN_SIZE_TRAIN
_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
# Maximum size of the side of the image during training
_C.INPUT.MAX_SIZE_TRAIN = 1333
# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
_C.INPUT.MIN_SIZE_TEST = 800
# Maximum size of the side of the image during testing
_C.INPUT.MAX_SIZE_TEST = 1333
# Mode for flipping images used in data augmentation during training
# choose one of ["horizontal", "vertical", "none"]
_C.INPUT.RANDOM_FLIP = "horizontal"

# `True` if cropping is used for data augmentation during training
_C.INPUT.CROP = CN({"ENABLED": False})

# Crop size if CROP.TYPE is "relative" or "relative_range",
# given as fractions in (0, 1] (see the crop sketch after this listing)
_C.INPUT.CROP.SIZE = [0.1, 0.5]

# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 4

# If True, when working on datasets that have instance annotations, the
# training dataloader will filter out images without associated annotations
_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True

# ---------------------------------------------------------------------------- #
# FPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.FPN = CN()

# Options: "" (no norm), "GN"
_C.MODEL.FPN.NORM = ""

# ---------------------------------------------------------------------------- #
# Anchor generator options
# ---------------------------------------------------------------------------- #
_C.MODEL.ANCHOR_GENERATOR = CN()
# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
# IN_FEATURES[i]; len(SIZES) == len(IN_FEATURES) must be true, or
# len(SIZES) == 1 is true and size list SIZES[0] is used for all IN_FEATURES.
_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
# ratios are generated by an anchor generator.
# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
# for all IN_FEATURES.
_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
# Anchor angles.
# list[list[float]], the angle in degrees, for each input feature map.
# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]

# ---------------------------------------------------------------------------- #
# ROI HEADS options
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_HEADS = CN()
# Number of foreground classes (background excluded)
_C.MODEL.ROI_HEADS.NUM_CLASSES = 80

# ---------------------------------------------------------------------------- #
# Box Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_BOX_HEAD = CN()
# Options: "" (no norm), "GN", "SyncBN".
_C.MODEL.ROI_BOX_HEAD.NORM = ""

# ---------------------------------------------------------------------------- #
# Mask Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_MASK_HEAD = CN()
# Normalization method for the convolution layers.
# Options: "" (no norm), "GN", "SyncBN".
_C.MODEL.ROI_MASK_HEAD.NORM = ""

# ---------------------------------------------------------------------------- #
# RetinaNet Head
# ---------------------------------------------------------------------------- #
_C.MODEL.RETINANET = CN()
# This is the number of foreground classes.
_C.MODEL.RETINANET.NUM_CLASSES = 80
_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
_C.MODEL.RETINANET.NUM_CONVS = 4
# IoU overlap ratio [bg, fg] for labeling anchors.
# Anchors with < bg are labeled negative (0)
# Anchors with >= bg and < fg are ignored (-1)
# Anchors with >= fg are labeled positive (1)
_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
# Prior prob for rare case (i.e. foreground) at the beginning of training.
# This is used to set the bias for the logits layer of the classifier subnet.
# This improves training stability in the case of heavy class imbalance.
_C.MODEL.RETINANET.PRIOR_PROB = 0.01
# Inference cls score threshold, only anchors with score > INFERENCE_TH are
# considered for inference (to improve speed)
_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
# Select topk candidates before NMS
_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
# Loss parameters
_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
# Options are: "smooth_l1", "giou"
_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"
# One of BN, SyncBN, FrozenBN, GN
# Only supports GN until unshared norm is implemented
_C.MODEL.RETINANET.NORM = ""

# ---------------------------------------------------------------------------- #
# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
# Note that parts of a resnet may be used for both the backbone and the head
# These options apply to both
# ---------------------------------------------------------------------------- #
_C.MODEL.RESNETS = CN()
# ResNet depth
_C.MODEL.RESNETS.DEPTH = 50
# Options: FrozenBN, GN, "SyncBN", "BN"
_C.MODEL.RESNETS.NORM = "FrozenBN"
# Apply Deformable Convolution in stages
# Specify if apply deform_conv on Res2, Res3, Res4, Res5
_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]

# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()
# Learning-rate schedule
_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
# Maximum number of training iterations
_C.SOLVER.MAX_ITER = 40000
# Base learning rate
_C.SOLVER.BASE_LR = 0.001
# Momentum
_C.SOLVER.MOMENTUM = 0.9
_C.SOLVER.NESTEROV = False
_C.SOLVER.WEIGHT_DECAY = 0.0001
# The weight decay that's applied to parameters of normalization layers
# (typically the affine transformation)
_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
# Learning-rate decay factor
_C.SOLVER.GAMMA = 0.1
# The learning rate is multiplied by GAMMA at each of these steps
_C.SOLVER.STEPS = (30000,)
# Warmup
_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
# Number of steps over which the LR ramps from WARMUP_FACTOR * BASE_LR to BASE_LR
_C.SOLVER.WARMUP_ITERS = 1000
_C.SOLVER.WARMUP_METHOD = "linear"
# Checkpoint save interval (in iterations)
_C.SOLVER.CHECKPOINT_PERIOD = 5000
# Batch size (total across all GPUs)
_C.SOLVER.IMS_PER_BATCH = 16
# Enable automatic mixed precision for training
# Note that this does not change model's inference behavior.
# To use AMP in inference, run inference under autocast()
_C.SOLVER.AMP = CN({"ENABLED": False})

# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
_C.TEST = CN()
# The period (in terms of steps) to evaluate the model during training.
# Set to 0 to disable.
_C.TEST.EVAL_PERIOD = 0
# Maximum number of detections returned per image
_C.TEST.DETECTIONS_PER_IMAGE = 100

# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
# Output directory
_C.OUTPUT_DIR = "./output"
# Benchmark different cudnn algorithms.
# If input images have very different sizes, this option will have large overhead
# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
# If input images have the same or similar sizes, benchmark is often helpful.
_C.CUDNN_BENCHMARK = False
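
The CROP keys earlier in this listing drive detectron2's T.RandomCrop augmentation. A minimal sketch of "relative_range" sampling, assuming the current detectron2 behavior where each crop fraction is drawn between the configured lower bound and 1.0; the image is a dummy example:

import numpy as np
from detectron2.data import transforms as T

# With "relative_range" and SIZE [0.1, 0.5], the crop's height fraction is
# sampled in [0.1, 1.0) and its width fraction in [0.5, 1.0).
aug = T.RandomCrop("relative_range", [0.1, 0.5])
image = np.zeros((800, 1333, 3), dtype=np.uint8)  # dummy image
cropped = aug.get_transform(image).apply_image(image)
print(cropped.shape)  # height roughly in [80, 800], width in [667, 1333]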
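
Finally, note how the SOLVER values scale across the configs above: Base-RCNN-FPN.yaml trains with IMS_PER_BATCH 16 and BASE_LR 0.02, while the cascade config uses 128 and 0.16, i.e. the learning rate grows linearly with the batch size (the linear scaling rule). A small sketch of applying the rule when changing the batch size; the helper is illustrative, not part of detectron2, and whether to rescale the schedule the same way is a judgment call:

from detectron2.config import get_cfg

def scale_solver(cfg, new_ims_per_batch):
    """Hypothetical helper: rescale LR (and schedule) with the batch size."""
    factor = new_ims_per_batch / cfg.SOLVER.IMS_PER_BATCH
    cfg.SOLVER.BASE_LR *= factor
    # Shrinking the schedule accordingly is a common companion adjustment.
    cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER / factor)
    cfg.SOLVER.STEPS = tuple(int(s / factor) for s in cfg.SOLVER.STEPS)
    cfg.SOLVER.IMS_PER_BATCH = new_ims_per_batch
    return cfg

# Applied to the base recipe: 0.02 * (128 / 16) = 0.16, which matches the
# BASE_LR in the cascade config of section 1.
cfg = scale_solver(get_cfg(), 128)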
