文章目录[隐藏]
【COCO数据集】获取并保存特定的某一类(例如“person”类)
1. 开始直接找到相应的github中保存COCO特定类别的开源轮子:
2. 存在问题:
save_imgs 函数中的 requests.get(im['coco_url']).content 是直接通过COCO的URL网络链接逐张下载图片,速度很慢,需要几个小时;
采用“科学上网”依旧下载速度很慢。
3. 解决办法:
利用迅雷等手段快速地将整个COCO数据集从官网下载到本地,再利用opencv从本地直接读取和复制图片,速度大大提升。修改原轮子后的python代码如下:
from pycocotools.coco import COCO
import requests
import os
from os.path import join
from tqdm import tqdm
import json
import cv2
import numpy as np
class coco_category_filter:
    """
    Copies the images of one COCO category from a local dataset copy and
    filters the annotation json so it only keeps that category.

    Instance attributes (all set by ``__init__``):
        coco      -- pycocotools ``COCO`` object built from ``json_path``
        json_path -- path of the source annotation json
        imgs_dir  -- directory under which the selected images are written
        categ     -- name of the category to keep
        images    -- image dicts returned by ``get_imgs_from_json``
        catIds    -- list of category ids matching ``categ``
    """

    def __init__(self, json_path, imgs_dir, categ='person'):
        self.coco = COCO(json_path)  # instantiate the COCO index
        self.json_path = json_path
        self.imgs_dir = imgs_dir
        self.categ = categ
        self.images = self.get_imgs_from_json()

    def get_imgs_from_json(self):
        """
        Return the image dicts that contain the desired category.

        Also stores the matching category ids in ``self.catIds``.

        Raises:
            ValueError: if ``self.categ`` matches no category in the json.
        """
        catIds = self.coco.getCatIds(catNms=[self.categ])
        print("catIds: ", catIds)
        if not catIds:
            # pycocotools treats an empty catIds filter as "no filter", so an
            # unknown category name would silently select every image.
            raise ValueError(
                "category {!r} not found in {!r}".format(self.categ, self.json_path)
            )
        # Get the corresponding image ids and images using loadImgs
        imgIds = self.coco.getImgIds(catIds=catIds)
        images = self.coco.loadImgs(imgIds)
        self.catIds = catIds  # list
        return images

    def _subset_dirname(self):
        """
        Derive the image sub-directory name from the annotation file name,
        e.g. ``instances_val2017.json`` -> ``val2017``.
        """
        stem = os.path.splitext(os.path.basename(self.json_path))[0]
        prefix = 'instances_'
        if stem.startswith(prefix):
            return stem[len(prefix):]
        return stem

    def save_imgs(self, root_dir, subdir=None):
        """
        Copy the images of this category from the local dataset.

        Images are read from ``root_dir/subdir`` and written to
        ``self.imgs_dir/subdir`` under the same file names.

        Args:
            root_dir: root directory of the local COCO copy.
            subdir: image sub-directory name (e.g. ``'val2017'``). When
                omitted it is derived from the annotation file name, so the
                method no longer depends on module-level globals.

        Raises:
            FileNotFoundError: if a source image is missing or unreadable.
            OSError: if an image cannot be written.
        """
        print("Saving the images with required categories ...")
        if subdir is None:
            subdir = self._subset_dirname()
        src_dir = os.path.join(root_dir, subdir)
        dst_dir = os.path.join(self.imgs_dir, subdir)
        os.makedirs(dst_dir, exist_ok=True)
        # Save the images into a local folder
        for im in tqdm(self.images):
            src_path = os.path.join(src_dir, im['file_name'])
            img_data = cv2.imread(src_path)
            if img_data is None:
                # cv2.imread signals a missing/unreadable file with None
                raise FileNotFoundError(
                    "could not read image: {}".format(src_path)
                )
            dst_path = os.path.join(dst_dir, im['file_name'])
            if not cv2.imwrite(dst_path, img_data):
                raise OSError("could not write image: {}".format(dst_path))

    def filter_json_by_category(self, new_json_path):
        """
        Write a new annotation json that only contains the desired category.

        Images and annotations are renumbered from 1 in the output. The
        renumbering is done on shallow copies, so the dicts held by
        ``self.coco.dataset`` and ``self.images`` keep their original ids.

        Args:
            new_json_path: path of the json file to create; its parent
                directory is created when needed.
        """
        # category entry example: {'supercategory': 'person', 'id': 1, 'name': 'person'}
        print("Filtering the annotations ... ")
        json_parent = os.path.split(new_json_path)[0]
        if json_parent:  # os.makedirs('') would raise for a bare file name
            os.makedirs(json_parent, exist_ok=True)

        dataset = self.coco.dataset
        # Sets give O(1) membership tests over large id collections
        kept_img_ids = {x['id'] for x in self.images}
        kept_cat_ids = set(self.catIds)

        ### Filter images and annotations (copied before their ids are rewritten)
        new_imgs = [dict(x) for x in dataset['images'] if x['id'] in kept_img_ids]
        new_annots = [
            dict(x) for x in dataset['annotations']
            if x['category_id'] in kept_cat_ids
        ]
        ### Reorganize the ids
        new_imgs, new_annots = self.modify_ids(new_imgs, new_annots)
        ### Filter categories
        new_categories = [x for x in dataset['categories'] if x['id'] in kept_cat_ids]
        print("new_categories: ", new_categories)
        data = {
            # 'info' and 'licenses' may be absent from a json; fall back to empty
            "info": dataset.get('info', {}),
            "licenses": dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories,
        }
        print("saving json: ")
        with open(new_json_path, 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        Renumber image and annotation ids consecutively, starting from 1.

        The dicts are modified in place and the same lists are returned.

        Args:
            images: list of image dicts, each with an ``'id'`` key.
            annotations: list of annotation dicts, each with ``'id'`` and
                ``'image_id'`` keys; every ``'image_id'`` must be the id of
                one of ``images``.

        Returns:
            The ``(images, annotations)`` pair with the new ids.

        Raises:
            KeyError: if an annotation refers to an image not in ``images``.
        """
        print("Reinitialicing images and annotation IDs ...")
        ### Images
        old_new_imgs_ids = {}  # old image id -> new image id, needed for the annotations
        for new_id, im in enumerate(images, start=1):
            old_new_imgs_ids[im['id']] = new_id
            im['id'] = new_id
        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            # replace the image id in the annotation as well
            ann['image_id'] = old_new_imgs_ids[ann['image_id']]
        return images, annotations
def main(subset, year, root_dir, put_dir, category='cell phone'):
    """
    Extract one category from a local COCO copy: copy its images and write
    a filtered annotation json.

    Args:
        subset: dataset split name, e.g. ``'val'`` or ``'train'``.
        year: dataset year as a string, e.g. ``'2017'``.
        root_dir: root directory of the local COCO copy; the annotations are
            read from ``root_dir/annotations/instances_<subset><year>.json``.
        put_dir: output root; images go under ``put_dir/<category>_<subset>``
            and the new json to ``put_dir/annotations/<subset>.json``.
        category: name of the category to keep.
    """
    # Join below root_dir directly: os.path.split(root_dir)[0] only equals the
    # dataset root when root_dir ends with a slash, and otherwise points at
    # the parent directory.
    json_file = join(root_dir, 'annotations', 'instances_' + subset + year + '.json')  # local path
    imgs_dir = join(put_dir, category + '_' + subset)
    new_json_file = join(put_dir, 'annotations', subset + ".json")
    coco_filter = coco_category_filter(json_file, imgs_dir, categ=category)  # instantiate class
    coco_filter.save_imgs(root_dir)
    coco_filter.filter_json_by_category(new_json_file)
if __name__ == '__main__':
    # Dataset split to process ('val' or 'train') and the dataset year.
    subset = 'val'
    year = '2017'
    # Placeholder paths: the local COCO root and the output root.
    root_dir = '/下载到本地的COCO数据集的绝对路径/COCO/'
    put_dir = '/某一特定类别的图片和annotations的保存路径/COCO_phone/'
    main(subset, year, root_dir, put_dir, category='cell phone')
注意:一定要记得修改训练时会用到的类别文件,如xxxx.pbtxt,如未修改会出现验证集mAP为0.00000的现象。
版权声明:本文为CSDN博主「frootguo」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_43348528/article/details/122497906
暂无评论