从 Dataturks 导出 Retail Products Dataset.json
从 Retail Products Dataset.json 中抽离 Retail Cigar Dataset.json,并做以下操作:
- cp 新图片 到 205 路径
- line 更新 content 路径
# A cigar label ends with an underscore followed by exactly one ASCII letter
# (upper or lower case).  The pattern is anchored at the end on purpose:
# otherwise labels such as BIG_ROLL, which merely contain '_' plus letters,
# would match as well.
_CIGAR_LABEL_RE = re.compile(r'^.+_[A-Za-z]$')


def _has_cigar_annotation(anns):
    """Return True if the first label of any annotation is a cigar label."""
    for ann in anns or ():
        labels = ann['label']
        # An annotation with an empty label list cannot be a cigar.
        if labels and _CIGAR_LABEL_RE.match(labels[0]):
            return True
    return False


def filter_cigars_to_json(old_dir='/nfs/xs/retail',
                          new_dir='/nfs/xs/docker/vipaturks',
                          old_projectId='2c9180826d47a650016d5e359eaf0004',
                          new_projectId='2c9180836ea666e4016ea8019d5e0001',
                          in_path=None,
                          out_path=None):
    """Extract the cigar records of a Dataturks export into a new JSON file.

    The input is read as one JSON object per line.  A record is kept when its
    ``metadata.evaluation`` equals ``'CORRECT'`` and at least one annotation
    carries a cigar label (see ``_CIGAR_LABEL_RE``).  For every kept record
    the image is copied from ``old_dir + content`` to ``new_dir + content``
    (with the project id swapped) unless the destination already exists, and
    the record line is written to ``out_path`` with the updated content path.

    Args:
        old_dir: Root directory holding the source images (the "207" side in
            the original notes).
        new_dir: Root directory receiving the copied images (the "205" side).
        old_projectId: Project id of the source project
            (Retail Products Dataset); replaced inside ``content``.
        new_projectId: Project id of the target project (CigarSeg).
        in_path: Dataturks export to read.  Defaults to
            ``data/Retail Products Dataset.json``.
        out_path: File to write.  Defaults to
            ``data/Retail Cigar Dataset.json``.

    Raises:
        OSError: If the input cannot be read, an image cannot be copied or
            the output cannot be written.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks ``metadata``/``evaluation``,
            ``annotation``, ``content`` or an annotation lacks ``label``.
    """
    if in_path is None:
        in_path = os.path.join('data', 'Retail Products Dataset.json')
    if out_path is None:
        out_path = os.path.join('data', 'Retail Cigar Dataset.json')

    # Read the input completely before touching the output, so that a missing
    # or unreadable input never truncates an existing output file.
    with open(in_path, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    print('total img:', len(lines))

    cigar_cnt = 0
    cp_cnt = 0
    kept_lines = []
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines in the export
        product_dict = json.loads(line)

        # Only images that have been evaluated as CORRECT are migrated.
        if product_dict['metadata']['evaluation'] != 'CORRECT':
            continue
        # A missing (None) or empty annotation list has no cigar.
        if not _has_cigar_annotation(product_dict['annotation']):
            continue
        cigar_cnt += 1

        # 1. Copy the image from the old location to the new one.
        old_content = product_dict['content']
        new_content = old_content.replace(old_projectId, new_projectId)
        # `content` starts with '/' (e.g. '/uploads/...'), so os.path.join
        # would discard the root directory; plain concatenation is intended.
        old_img_path = old_dir + old_content
        new_img_path = new_dir + new_content
        if not os.path.exists(new_img_path):
            dst_dir = os.path.dirname(new_img_path)
            if dst_dir:
                # The target project directory may not exist yet.
                os.makedirs(dst_dir, exist_ok=True)
            shutil.copyfile(old_img_path, new_img_path)
            cp_cnt += 1
            print('\rcp', cp_cnt, end='')

        # 2. Patch the content path in the raw line instead of re-serialising
        #    the dict, so the rest of the record stays byte-identical.
        # NOTE(review): this relies on the raw line containing `content`
        # exactly as decoded (no JSON escapes inside the path) -- confirm
        # against the exporter.
        kept_lines.append(line.replace(old_content, new_content))

    # Written only after every record has been processed: a failure above
    # leaves a previous output file untouched instead of half-written.
    with open(out_path, 'w', encoding='utf-8') as fw:
        fw.writelines(kept_lines)
    print('\ncigar img:', cigar_cnt)
- 构建数据集,注意修改 `dataset_utils.py` 中的 `convert_to_coco()`:不同任务,转化的 box 不同
def build_top_k_dataset(dataset, top_k=None):
    """Split and save *dataset*, then dump a summary config next to it.

    ``dataset`` and ``top_k`` are forwarded to
    ``split_and_save_coco_dataset`` together with the module-level
    ``dataset_dir``.  The values it returns are stored in a small config dict
    that is written to ``<dataset_dir>/<prefix><dataset_name>_cfg.json``,
    where ``<prefix>`` is ``'<top_k>_'`` when ``top_k`` is truthy and empty
    otherwise.
    """
    split_result = split_and_save_coco_dataset(dataset, dataset_dir=dataset_dir, top_k=top_k)
    kept_cats, kept_cats_num, n_train, n_val, n_test = split_result

    # Config files of top-k variants are told apart by a '<k>_' prefix.
    if top_k:
        prefix = '{}_'.format(top_k)
    else:
        prefix = ''
    cfg_path = os.path.join(dataset_dir, prefix + '{}_cfg.json'.format(dataset_name))

    summary = {
        'name': 'Cigar Rotated Box',
        'cats_num': kept_cats_num,
        'classes': len(kept_cats),
        'train': n_train,
        'valid': n_val,
        'test': n_test
    }
    dump_json(summary, out_path=cfg_path)
def build_rbox_dataset():
    """Build the rbox datasets: one per top-k setting, then one with all classes.

    Loads ``data/Retail Cigar Dataset.json`` through
    ``create_dataset_from_dataturks_json`` and calls ``build_top_k_dataset``
    for top_k = 20, 40 and finally for the unrestricted case.
    """
    json_path = os.path.join('data', 'Retail Cigar Dataset.json')
    dataset = create_dataset_from_dataturks_json(dataturks_json_path=json_path)

    # None is build_top_k_dataset's default top_k, i.e. "keep every class";
    # it goes last so the all-classes build still runs after the top-k ones.
    for k in (20, 40, None):
        build_top_k_dataset(dataset, top_k=k)
- 如果想在 Dataturks 中更新 CigarSeg,可以
- 清空数据集
def delete_d_hits_by_name(project_name):
    """Delete all ``d_hits`` rows of the project called *project_name*.

    The project id is looked up with ``query_projectId_by_name`` and the
    deletion is committed immediately.
    """
    project_id = query_projectId_by_name(project_name)
    # Pass the id as a bound parameter rather than formatting it into the SQL
    # text, so quoting is handled by the driver.
    # NOTE(review): assumes db.session is an SQLAlchemy session that accepts a
    # plain SQL string with named binds (as the original raw-string call
    # implies) -- confirm.
    db.session.execute("delete from d_hits where projectId=:project_id",
                       {'project_id': project_id})
    db.session.commit()
- 重新上传 Retail Cigar Dataset.json
至此,Cigar 数据迁移完毕。