1. Algorithm Principle
The op takes the N levels of score, bbox_pred, and anchor tensors plus the image_shape as input, decodes proposals from the anchors and the box deltas (bbox_pred), runs NMS on those proposals, and finally keeps the top num of them.
2. Execution Steps
- For each of the N levels, sort score, bbox_pred, and anchor by score in descending order and keep the top nms_pre entries (typically 1000), giving N*nms_pre candidates in total.
- Decode proposals from the anchors and the box deltas (bbox_pred).
- Discard boxes with non-positive width or height, then add a sufficiently large per-level offset to each level's proposals so that boxes from different levels can never overlap, which turns the multi-class NMS into a single class-agnostic NMS (see the sketch after this list).
- Concatenate the scores and proposals of all N levels and sort them by score in descending order.
- Run NMS.
- Keep the top num results and subtract the previously added offset from each proposal.
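To make the per-level offset trick concrete, here is a minimal self-contained C++ sketch (the value 4096 for offset is illustrative; it only needs to exceed the largest image dimension):

#include <cstdio>

int main(){
    // Two boxes that overlap heavily in image coordinates,
    // but come from different cascade levels.
    float box_a[4] = {10, 10, 50, 50};  // level 0
    float box_b[4] = {20, 20, 60, 60};  // level 1
    const float offset = 4096;          // larger than any image side

    // Shift every coordinate of a level-l box by l * offset.
    for(int k = 0; k < 4; k++) box_b[k] += 1 * offset;

    // box_a now lies in [0, 4096) and box_b in [4096, 8192), so their IoU
    // is exactly 0: a single class-agnostic NMS pass can only suppress
    // boxes within the same level, which is exactly what we want.
    printf("box_b after shift: (%.0f, %.0f, %.0f, %.0f)\n",
           box_b[0], box_b[1], box_b[2], box_b[3]);
    return 0;
}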
3. Python Source Code Walkthrough
# Path: mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py:StageCascadeRPNHead::_get_bboxes_single
level_ids = []
mlvl_scores = []
mlvl_bbox_preds = []
mlvl_valid_anchors = []
for idx in range(len(cls_scores)):  # len(cls_scores) is the number of cascade levels N
    rpn_cls_score = cls_scores[idx]  # this level's score
    rpn_bbox_pred = bbox_preds[idx]  # this level's bbox_pred
    assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
    # per-level score shape: (num_anchors * num_classes, H, W);
    # per-level bbox_pred shape: (num_anchors * 4, H, W)
    rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
    if self.use_sigmoid_cls:  # binary classification of the scores via sigmoid
        rpn_cls_score = rpn_cls_score.reshape(-1)
        scores = rpn_cls_score.sigmoid()
    else:  # binary classification of the scores via softmax
        rpn_cls_score = rpn_cls_score.reshape(-1, 2)
        # We set FG labels to [0, num_class-1] and BG label to
        # num_class in RPN head since mmdet v2.5, which is unified to
        # be consistent with other head since mmdet v2.0. In mmdet v2.0
        # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
        scores = rpn_cls_score.softmax(dim=1)[:, 0]
    rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
    anchors = mlvl_anchors[idx]
    if 0 < nms_pre < scores.shape[0]:
        # sort is faster than topk
        # _, topk_inds = scores.topk(cfg.nms_pre)
        ranked_scores, rank_inds = scores.sort(descending=True)
        topk_inds = rank_inds[:nms_pre]
        scores = ranked_scores[:nms_pre]
        rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
        anchors = anchors[topk_inds, :]
    mlvl_scores.append(scores)
    mlvl_bbox_preds.append(rpn_bbox_pred)
    mlvl_valid_anchors.append(anchors)
    level_ids.append(
        scores.new_full((scores.size(0), ), idx, dtype=torch.long))
scores = torch.cat(mlvl_scores)
anchors = torch.cat(mlvl_valid_anchors)
rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
proposals = self.bbox_coder.decode(  # decode anchors + deltas (bbox_pred) into proposals
    anchors, rpn_bbox_pred, max_shape=img_shape)
ids = torch.cat(level_ids)
if cfg.min_bbox_size >= 0:  # drop boxes whose width/height does not exceed min_bbox_size
    w = proposals[:, 2] - proposals[:, 0]
    h = proposals[:, 3] - proposals[:, 1]
    valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
    if not valid_mask.all():
        proposals = proposals[valid_mask]
        scores = scores[valid_mask]
        ids = ids[valid_mask]
# NMS (batched_nms offsets boxes by their level id, so NMS never crosses levels)
if proposals.numel() > 0:
    dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
else:
    return proposals.new_zeros(0, 5)
# keep the top max_per_img proposals
return dets[:cfg.max_per_img]
4. CPU Source Code Walkthrough
float* score_ptr = new float[level*nms_pre];  // level is the number of cascade levels, nms_pre is the number of boxes kept per level
memset(score_ptr, 0, sizeof(float)*level*nms_pre);  // some levels have fewer than nms_pre boxes; zero the scores of the padding slots
float* score_sorted_ptr = new float[level*nms_pre];  // buffer for the sorted scores
float* bbox_pred = new float[level*nms_pre*4];  // box deltas, 4 per box
float* anchor = new float[level*nms_pre*4];  // anchor coordinates, 4 per box
float* proposal_ptr = new float[level*nms_pre*4];  // decoded proposal coordinates, 4 per box
float* proposal_sorted_ptr = new float[level*nms_pre*4];  // sorted proposal coordinates
// step1: gather and sort the N levels of score, bbox_pred and anchor (one thread per level)
vector<thread> vec_thread;
for(int i=0; i<level; i++){
    float* score = score_ptr + i*nms_pre;
    float* bbox = bbox_pred + i*nms_pre*4;
    float* anch = anchor + i*nms_pre*4;
    vec_thread.push_back(thread(merge_input, i, nms_pre, score, bbox, anch));
}
for(int i=0; i<level; i++){
    vec_thread[i].join();
}
// parameters filled in from the call site above; level is assumed to be globally visible
void merge_input(int i, int nms_pre, float* score, float* bbox, float* anch){
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i+level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i+level*2)->template Data<float>();
    // sort the scores and keep the top nms_pre
    vector<KeyValuePair> vec_node;
    vec_node.resize(Input<Tensor>(i)->Shape().Size());  // sort all of this level's scores
    vector<int> sorted_id = SortedIdx(input_score, vec_node, nms_pre);
    for(int j=0; j<nms_pre; j++){  // loop variable renamed so it does not shadow the level index i
        score[j] = input_score[sorted_id[j]];
        bbox[j*4] = input_bbox[sorted_id[j]*4];
        bbox[j*4+1] = input_bbox[sorted_id[j]*4+1];
        bbox[j*4+2] = input_bbox[sorted_id[j]*4+2];
        bbox[j*4+3] = input_bbox[sorted_id[j]*4+3];
        anch[j*4] = input_anchor[sorted_id[j]*4];
        anch[j*4+1] = input_anchor[sorted_id[j]*4+1];
        anch[j*4+2] = input_anchor[sorted_id[j]*4+2];
        anch[j*4+3] = input_anchor[sorted_id[j]*4+3];
    }
}
typedef struct{
    float key;
    int value;
}KeyValuePair;
bool compareNode(const KeyValuePair& node1, const KeyValuePair& node2){ return node1.key > node2.key; }
// assumes nms_pre <= vec_node.size()
vector<int> SortedIdx(const float* input_score, vector<KeyValuePair>& vec_node, int nms_pre){
    for(int i=0; i<(int)vec_node.size(); i++){
        vec_node[i] = {input_score[i], i};
    }
    sort(vec_node.begin(), vec_node.end(), compareNode);  // descending order: scores paired with their original indices
    vector<int> sorted_id(nms_pre);
    for(int i=0; i<nms_pre; i++) sorted_id[i] = vec_node[i].value;
    return sorted_id;
}
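Note that SortedIdx fully sorts the level's scores even though only the top nms_pre are kept. A possible micro-optimization, not in the original post, is std::partial_sort, which only orders the leading nms_pre elements:

// Sketch: same contract as SortedIdx, but stops ordering after the top nms_pre
// pairs; the caller must still ensure nms_pre <= vec_node.size().
vector<int> SortedIdxPartial(const float* input_score, vector<KeyValuePair>& vec_node, int nms_pre){
    for(int i=0; i<(int)vec_node.size(); i++){
        vec_node[i] = {input_score[i], i};
    }
    partial_sort(vec_node.begin(), vec_node.begin()+nms_pre, vec_node.end(), compareNode);
    vector<int> sorted_id(nms_pre);
    for(int i=0; i<nms_pre; i++) sorted_id[i] = vec_node[i].value;
    return sorted_id;
}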
// step2: decode proposals from the anchors and the box deltas (bbox_pred)
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// see the Python delta2bbox implementation at mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
delta2bbox(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);  // note: bbox_pred and anchor are only sorted within each level, not globally
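delta2bbox itself is not reproduced in the post. For reference, here is a minimal CPU sketch of the decoding it performs, following the formulas in mmdetection's delta_xywh_bbox_coder (the wh_ratio_clip clamping of dw/dh is omitted for brevity, and image_shape is assumed to be (H, W)):

#include <math.h>

// Decode per-box deltas (dx, dy, dw, dh) against anchors (x1, y1, x2, y2)
// into clamped proposals (x1, y1, x2, y2).
void delta2bbox(const float* deltas, const float* anchors, const int* image_shape,
                const float* means, const float* stds, float* out, int num_boxes){
    float H = (float)image_shape[0], W = (float)image_shape[1];
    for(int i=0; i<num_boxes; i++){
        // denormalize the deltas
        float dx = deltas[i*4]   * stds[0] + means[0];
        float dy = deltas[i*4+1] * stds[1] + means[1];
        float dw = deltas[i*4+2] * stds[2] + means[2];
        float dh = deltas[i*4+3] * stds[3] + means[3];
        // anchor center and size
        float pw = anchors[i*4+2] - anchors[i*4];
        float ph = anchors[i*4+3] - anchors[i*4+1];
        float px = (anchors[i*4] + anchors[i*4+2]) * 0.5f;
        float py = (anchors[i*4+1] + anchors[i*4+3]) * 0.5f;
        // shifted center, scaled size
        float gw = pw * expf(dw);
        float gh = ph * expf(dh);
        float gx = px + pw * dx;
        float gy = py + ph * dy;
        // corner form, clamped to the image
        out[i*4]   = fminf(fmaxf(gx - gw * 0.5f, 0.f), W);
        out[i*4+1] = fminf(fmaxf(gy - gh * 0.5f, 0.f), H);
        out[i*4+2] = fminf(fmaxf(gx + gw * 0.5f, 0.f), W);
        out[i*4+3] = fminf(fmaxf(gy + gh * 0.5f, 0.f), H);
    }
}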
// step3: sanitize the proposal coordinates, add the per-level offset, then sort all scores and proposals globally
for(int i=0; i<num_boxes; i++){
    float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
    float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
    if(w<=min_w || h<=min_h){  // min_w and min_h are usually 0
        // instead of removing the box (which would change the fixed buffer sizes),
        // collapse it into a tiny box
        proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;
        proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
    }
    int level_id = i / nms_pre;  // add a per-level offset
    proposal_ptr[i*4] += level_id*offset;
    proposal_ptr[i*4+1] += level_id*offset;
    proposal_ptr[i*4+2] += level_id*offset;
    proposal_ptr[i*4+3] += level_id*offset;
}
vector<KeyValuePair> vec_node;
vec_node.resize(num_boxes);  // sort all the scores
vector<int> sorted_id = SortedIdx(score_ptr, vec_node, num_boxes);
for(int i=0; i<num_boxes; i++){
    score_sorted_ptr[i] = score_ptr[sorted_id[i]];
    proposal_sorted_ptr[i*4] = proposal_ptr[sorted_id[i]*4];
    proposal_sorted_ptr[i*4+1] = proposal_ptr[sorted_id[i]*4+1];
    proposal_sorted_ptr[i*4+2] = proposal_ptr[sorted_id[i]*4+2];
    proposal_sorted_ptr[i*4+3] = proposal_ptr[sorted_id[i]*4+3];
}
// step4: NMS, following onnxruntime's CPU implementation,
// onnxruntime/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
vector<int> selected_indices;  // output: indices into score_sorted_ptr
vector<BoxInfo> selected_boxes_inside_classes;
selected_boxes_inside_classes.reserve(num_boxes);
nms(selected_indices, selected_boxes_inside_classes, score_sorted_ptr, proposal_sorted_ptr);
inline void MaxMin(float lhs, float rhs, float& min, float& max){
    if(lhs >= rhs){
        min = rhs;
        max = lhs;
    }
    else{
        min = lhs;
        max = rhs;
    }
}
struct BoxInfo{
    float score_{};
    int index_{};
    float box_[4]{};
    BoxInfo() = default;
    explicit BoxInfo(float score, int idx, int center_point_box, const float* box) : score_(score), index_(idx){
        if(center_point_box == 0){
            // box layout is [y1, x1, y2, x2]
            MaxMin(box[1], box[3], box_[1], box_[3]);  // put the smaller input coordinate on the min side
            MaxMin(box[0], box[2], box_[0], box_[2]);
        }
        else{
            // box layout is [x_center, y_center, width, height]
            float box_width_half = box[2] / 2;
            float box_height_half = box[3] / 2;
            box_[1] = box[0] - box_width_half;
            box_[3] = box[0] + box_width_half;
            box_[0] = box[1] - box_height_half;
            box_[2] = box[1] + box_height_half;
        }
    }
    inline bool operator<(const BoxInfo& rhs) const {  // orders by score, breaking ties so the smaller index ranks higher
        return score_ < rhs.score_ || (score_ == rhs.score_ && index_ > rhs.index_);
    }
};
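The nms function itself is also not shown. For orientation, here is a minimal greedy sketch in the spirit of onnxruntime's CPU kernel, for a single class with scores already sorted in descending order; iou_threshold and max_output_boxes are hypothetical parameters, and boxes are taken in corner form (x1, y1, x2, y2):

#include <vector>
#include <algorithm>

// IoU of two corner-form boxes.
static float IoU(const float* a, const float* b){
    float ix1 = std::max(a[0], b[0]), iy1 = std::max(a[1], b[1]);
    float ix2 = std::min(a[2], b[2]), iy2 = std::min(a[3], b[3]);
    float inter = std::max(ix2 - ix1, 0.f) * std::max(iy2 - iy1, 0.f);
    float uni = (a[2]-a[0])*(a[3]-a[1]) + (b[2]-b[0])*(b[3]-b[1]) - inter;
    return uni > 0.f ? inter / uni : 0.f;
}

// Greedy NMS: walk the score-sorted boxes and keep a box only if it does not
// overlap any already-kept box above the threshold.
void nms_sketch(std::vector<int>& selected_indices, const float* boxes, int num_boxes,
                float iou_threshold, int max_output_boxes){
    for(int i=0; i<num_boxes && (int)selected_indices.size() < max_output_boxes; i++){
        bool keep = true;
        for(int j : selected_indices){
            if(IoU(boxes + i*4, boxes + j*4) > iou_threshold){ keep = false; break; }
        }
        if(keep) selected_indices.push_back(i);
    }
}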
// step5: keep the top num boxes
Tensor *output = ctx->Output(0, {num, 5});  // output 0 has shape (num, 5): 4 box coordinates followed by the score
float* output_ptr = output->template MutableData<float>();  // get the output pointer
for(int i=0; i<num; i++){
    int index = selected_indices[i];
    // subtract the per-level offset that was added before NMS
    output_ptr[i*5] = proposal_sorted_ptr[index*4] - (int)(proposal_sorted_ptr[index*4] / offset) * offset;
    output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
    output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
    output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
    output_ptr[i*5+4] = score_sorted_ptr[index];
}
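The subtraction above is simply a positive-value modulo, so each coordinate line is equivalent to a single fmodf call from <math.h>, for example:

output_ptr[i*5] = fmodf(proposal_sorted_ptr[index*4], offset);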
5. CUDA Source Code Walkthrough
int threadPerBlock = 32*4;
int blockPerGrid = 0;
void* storage_ptr = nullptr;  // pointer to the temporary workspace
size_t storage_bytes = 0;     // size of the temporary workspace
// step1: gather and sort the N levels of score, bbox_pred and anchor
for(int i=0; i<level; i++){
    int score_num = Input<Tensor>(i)->Shape().Size();
    blockPerGrid = (score_num + threadPerBlock - 1) / threadPerBlock;
    Iota<int><<<blockPerGrid, threadPerBlock, 0, stream>>>(score_id, 0, score_num);  // build the score indices
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i+level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i+level*2)->template Data<float>();
    // input_score_bak is a buffer of the same size as input_score, used for double buffering;
    // cub sorts in place, so the const is cast away here (the sort may scribble on the input tensor)
    cub::DoubleBuffer<float> d_keys(const_cast<float*>(input_score), input_score_bak);
    cub::DoubleBuffer<int> d_values(score_id, score_id_bak);
    void* temp_storage_ptr = nullptr;
    size_t temp_storage_bytes = 0;
    // sorts d_keys and d_values together; with temp_storage_ptr == nullptr it only
    // computes temp_storage_bytes and does nothing else
    cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, score_num,
        0, 8*sizeof(float), stream);
    RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only if the required workspace is larger than before
    cub::DeviceRadixSort::SortPairsDescending(storage_ptr, temp_storage_bytes, d_keys, d_values, score_num,
        0, 8*sizeof(float), stream);
    blockPerGrid = (nms_pre + threadPerBlock - 1) / threadPerBlock;
    merge_input<<<blockPerGrid, threadPerBlock, 0, stream>>>(input_score_bak, score_ptr, input_bbox, bbox_pred, input_anchor, anchor, score_id, nms_pre, i);
}
template <typename T>
__global__ void Iota(T* to_fill, const T offset, const int num){
    for(int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < num; idx += blockDim.x*gridDim.x){
        to_fill[idx] = static_cast<T>(idx) + offset;
    }
}
#define RE_CUDA_MALLOC(ptr, pre_size, now_size, ele_size) \
    if(pre_size < now_size){ \
        if(ptr == nullptr) cudaMalloc(&ptr, now_size*ele_size); \
        else{ \
            cudaFree(ptr); \
            cudaMalloc(&ptr, now_size*ele_size); \
        } \
        pre_size = now_size; \
    }
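As a style note, macros like this are usually wrapped in do { } while(0) so they expand to a single statement and compose safely with if/else. A hardened sketch (the name RE_CUDA_MALLOC_SAFE is hypothetical):

#define RE_CUDA_MALLOC_SAFE(ptr, pre_size, now_size, ele_size) \
    do{ \
        if((pre_size) < (now_size)){ \
            if((ptr) != nullptr) cudaFree(ptr); \
            cudaMalloc(&(ptr), (now_size)*(ele_size)); \
            (pre_size) = (now_size); \
        } \
    }while(0)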
// parameters filled in from the launch site above
__global__ void merge_input(const float* input_score_bak, float* score_ptr,
                            const float* input_bbox, float* bbox_pred,
                            const float* input_anchor, float* anchor,
                            const int* score_id, int nms_pre, int i){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < nms_pre){
        int index = score_id[idx];
        int dest = i * nms_pre + idx;
        score_ptr[dest] = input_score_bak[idx];  // input_score_bak already holds the sorted scores
        bbox_pred[dest*4] = input_bbox[index*4];
        bbox_pred[dest*4+1] = input_bbox[index*4+1];
        bbox_pred[dest*4+2] = input_bbox[index*4+2];
        bbox_pred[dest*4+3] = input_bbox[index*4+3];
        anchor[dest*4] = input_anchor[index*4];
        anchor[dest*4+1] = input_anchor[index*4+1];
        anchor[dest*4+2] = input_anchor[index*4+2];
        anchor[dest*4+3] = input_anchor[index*4+3];
    }
}
// step2: decode proposals from the anchors and the box deltas (bbox_pred)
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// see the Python delta2bbox implementation at mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
blockPerGrid = (num_boxes + threadPerBlock - 1) / threadPerBlock;
delta2bbox<<<blockPerGrid, threadPerBlock, 0, stream>>>(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);
// step3: sanitize the proposal coordinates, add the per-level offset, then sort all scores and proposals globally
valid_w_h<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_ptr, num_boxes, min_w, min_h, nms_pre, offset);
// parameters filled in from the values the kernel body uses
__global__ void valid_w_h(float* proposal_ptr, int num_boxes, float min_w, float min_h, int nms_pre, float offset){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num_boxes){
        float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
        float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
        if(w<=min_w || h<=min_h){  // min_w and min_h are usually 0
            proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;  // collapse invalid boxes into tiny boxes
            proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
        }
        int level_id = i / nms_pre;  // add a per-level offset
        proposal_ptr[i*4] += level_id*offset;
        proposal_ptr[i*4+1] += level_id*offset;
        proposal_ptr[i*4+2] += level_id*offset;
        proposal_ptr[i*4+3] += level_id*offset;
    }
}
// step4: NMS, following onnxruntime's GPU implementation,
// onnxruntime/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc
struct __align__(16) Box{
    float x1, y1, x2, y2;
};
cub::DoubleBuffer<float> d_keys(score_ptr, score_ptr_bak);
cub::DoubleBuffer<Box> d_values((Box*)proposal_ptr, (Box*)proposal_ptr_bak);
void* temp_storage_ptr = nullptr;
size_t temp_storage_bytes = 0;
// sorts d_keys and d_values together; with temp_storage_ptr == nullptr it only
// computes temp_storage_bytes and does nothing else
cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes,
    0, 8*sizeof(float), stream);
RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only if the required workspace is larger than before
cub::DeviceRadixSort::SortPairsDescending(storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes,
    0, 8*sizeof(float), stream);
nms(stream, 0, score_ptr_bak, proposal_ptr_bak, proposal_sorted_ptr, selected_indices);
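One caveat for both radix sorts above: after cub::DeviceRadixSort with a cub::DoubleBuffer, the sorted data lives in whichever buffer Current() selects, which is not guaranteed to be the _bak buffer that the code reads afterwards. A safer retrieval (a sketch; the nms signature is the one used above, and the same applies to score_id in step 1):

// Read the sorted arrays through Current() instead of assuming which
// of the two buffers the sort finished in.
float* sorted_scores = d_keys.Current();
float* sorted_boxes = (float*)d_values.Current();
nms(stream, 0, sorted_scores, sorted_boxes, proposal_sorted_ptr, selected_indices);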
// step5: keep the top num boxes
blockPerGrid = (num + threadPerBlock - 1) / threadPerBlock;
select_top_n<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_sorted_ptr, score_ptr_bak, output_ptr, selected_indices, num, offset);
// parameters filled in from the launch site; offset must be passed in since device code cannot read host variables
__global__ void select_top_n(const float* proposal_sorted_ptr, const float* score_ptr_bak,
                             float* output_ptr, const int* selected_indices, int num, float offset){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num){
        int index = selected_indices[i];
        // subtract the per-level offset that was added before NMS
        output_ptr[i*5] = proposal_sorted_ptr[index*4] - (int)(proposal_sorted_ptr[index*4] / offset) * offset;
        output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
        output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
        output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
        output_ptr[i*5+4] = score_ptr_bak[index];
    }
}
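Finally, both the CPU and CUDA step 5 index selected_indices[i] for every i < num without checking how many boxes NMS actually kept. A host-side guard might look like this (a sketch; selected_count is a hypothetical variable holding the number of surviving boxes, e.g. copied back from the device):

// Copy only the boxes NMS actually produced, then zero-fill the remainder
// of the (num, 5) output so its shape stays fixed.
int valid = selected_count < num ? selected_count : num;
blockPerGrid = (valid + threadPerBlock - 1) / threadPerBlock;
select_top_n<<<blockPerGrid, threadPerBlock, 0, stream>>>(
    proposal_sorted_ptr, score_ptr_bak, output_ptr, selected_indices, valid, offset);
cudaMemsetAsync(output_ptr + valid*5, 0, sizeof(float)*5*(num - valid), stream);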
Copyright notice: This is an original article by CSDN blogger "zhuikefeng", licensed under CC 4.0 BY-SA. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/zhuikefeng/article/details/123135870