介绍

在进行目标检测任务时，背景通常会成为影响识别准确性的重要因素，例如，模型有时候会把背景识别成某件物体、两个物体相互遮挡掉一部分而识别错误等，并且卷积神经网络的旋转适应性有局限，缺乏显式的旋转不变性机制，也就是说，在物体以旋转的姿态摆放的情况下，通常会出现难以识别、置信度低下等问题。如果你试过市面上的旋转增强方法，你会发现它们都会有bbox莫名其妙变大的bug，这个是无法避免的，只有已知物体轮廓的情况下旋转才能保证bbox仍然是外接矩形，因此需要提出一种数据集增强的方法，向模型提供一个物体在不同背景下的表现情况，以及物体在不同旋转角情况下的状态。

在参加2023年工训实践大赛期间，我们发现我们的目标检测模型通常在背景不同的时候出现很高的误检率，由于我们机械结构的设计使置物平台的背景并不是纯色，如图所示的螺钉，会干扰我们的识别。例如，有时候我们的电池所在的bbox区域内刚好出现这些螺钉时，会出现不识别的情况。

并且也出现了这样一种问题，即：当一个电池或者矿泉水瓶进行旋转，或是矿泉水瓶的一部分超出了识别区域，模型就完全无法检出这个物体。因此我提出了下面这种数据增强方法，即通过SAM模型先制作出少量的语义分割数据集，通过将其随机旋转后与随机的背景于随机的位置结合，得到大量的目标检测数据集，最终经过我们的比赛实践等，我们只需要经过少量的标注即可取得非常好的识别效果，超越了大部分人的检测效果，取得了安徽省一等奖和国家级二等奖。

增强效果图如下

不过，在国赛的时候，主办方给我们小组的垃圾形态比较特殊，例如红色的小米电池、胡萝卜条、红色砖块等，使我们的识别模型产生混淆，大量地将电池误识别为胡萝卜，因而遗憾落败，无法进入一等奖的行列。当时许多队伍也因此吃亏，因此衍生出了许多有趣的梗图。

数据集制作

安装anylabeling

conda create -n anylabeling python=3.8 anaconda
conda activate anylabeling

CPU：

1	pip install anylabeling

GPU:

1	pip install anylabeling-gpu

安装完毕后使用指令运行

1	anylabeling

在你下次需要执行的时候，你只需要这样做

conda activate anylabeling
anylabeling

准备好你要标注的图片文件夹，点击这里选择你的图片文件夹

然后点击这个大脑，开启SAM标注

选择一个你想要的模型，模型是自动从网上下载的，如果失败请使用特殊方法下载（本文不介绍）

然后点+Point按钮，点击物体即可

没问题就点击finish，如果有问题，可以用-Point在误标记的地方点击，它会自动重新计算。

写入你希望的名称，同一个物体的名称必须相同

完成所有标记后，所有标签和图片将会存放在同一个文件夹内，像我这样：

然后，像我这样准备几张空白的背景图，背景图请尽可能制造一些差异。

代码

这几个函数是我编写的图像处理工具函数，无脑复制就可以了，请安装opencv、numpy、matplotlib、pandas库，具体安装方法我不赘述。

import json
import cv2
import numpy as np
from matplotlib import pyplot as plt
import random
import os
import pandas as pd
import shutil

def cut(image):
    """Crop *image* to the bounding box of its largest bright region.

    The image is read as BGR, converted to grayscale and binarised with a
    threshold of 20. Among the external contours of the binary image, the
    one with the largest area defines the crop rectangle.

    Args:
        image: BGR image array.

    Returns:
        A tuple ``(cropped, [x, y, w, h])``: the slice of *image* covering
        the bounding rectangle, and that rectangle in *image* coordinates.

    Raises:
        ValueError: If no contour is found (``max`` of an empty sequence).
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(grayscale, 20, 255, cv2.THRESH_BINARY)
    outlines, _ = cv2.findContours(
        binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    largest = max(outlines, key=cv2.contourArea)
    left, top, width, height = cv2.boundingRect(largest)
    cropped = image[top:top + height, left:left + width]
    return cropped, [left, top, width, height]


def rotate(image, angle):
    """Rotate *image* about its centre without clipping any of its content.

    The output canvas is sized to the axis-aligned bounding box of the four
    rotated image corners, and the affine matrix is shifted so that this box
    starts at the origin. The tight bounding box of the rotated object is
    then measured from its contour, so it stays a true enclosing rectangle.

    Args:
        image: BGR image array; near-black pixels (gray value <= 20) are
            treated as background when locating the object.
        angle: Rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D``.

    Returns:
        A tuple ``(rotated_image, (x, y, w, h))`` where the rectangle is the
        bounding box of the largest contour inside ``rotated_image``.

    Raises:
        ValueError: If the rotated image contains no contour above the
            threshold (for example an all-black input).
    """
    height, width = image.shape[:2]
    center = (width / 2, height / 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1)

    # Map the four corners of the full image through the rotation. Each row
    # is a homogeneous point (x, y, 1), so one matrix product transforms all.
    corners = np.array(
        [[0, 0, 1], [width, 0, 1], [width, height, 1], [0, height, 1]],
        dtype=np.float64,
    )
    rotated_corners = corners @ rotation_matrix.T
    min_x, min_y = rotated_corners.min(axis=0)
    max_x, max_y = rotated_corners.max(axis=0)

    # Round up so nothing is cut off; the small epsilon keeps floating-point
    # noise (e.g. cos(90 deg) ~ 6e-17) from adding a spurious extra pixel.
    new_width = max(1, int(np.ceil(max_x - min_x - 1e-9)))
    new_height = max(1, int(np.ceil(max_y - min_y - 1e-9)))

    # Translate so the rotated content starts at the canvas origin.
    rotation_matrix[0, 2] -= min_x
    rotation_matrix[1, 2] -= min_y

    rotated_image = cv2.warpAffine(
        image, rotation_matrix, (new_width, new_height)
    )

    gray = cv2.cvtColor(rotated_image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 20, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if not contours:
        raise ValueError("rotate(): no object found in the rotated image")
    max_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(max_contour)

    return rotated_image, (x, y, w, h)


def insert(bkg, img):
    """Paste the object in *img* onto *bkg* at a random position.

    Pixels of *img* whose first three channels are all below 30 are treated
    as transparent. Every other pixel replaces the background pixel as a
    whole, so an object colour with a zero channel (e.g. pure red) is copied
    unchanged instead of being mixed with the background's channel values.

    Args:
        bkg: Background BGR image. It is modified in place so that several
            objects can be pasted onto the same background one after another.
        img: BGR patch containing the object on a black background. It is
            not modified.

    Returns:
        A tuple ``(bkg, (x, y, w, h))``: the same background array and the
        bounding box of the pasted object in background coordinates.

    Raises:
        ValueError: If *img* is larger than *bkg*, or if *img* contains no
            visible object.
    """
    h, w = img.shape[:2]
    max_x = bkg.shape[1] - w
    max_y = bkg.shape[0] - h
    if max_x < 0 or max_y < 0:
        raise ValueError(
            "insert(): patch of size {}x{} does not fit into a {}x{} "
            "background".format(w, h, bkg.shape[1], bkg.shape[0])
        )
    x = random.randint(0, max_x)
    y = random.randint(0, max_y)

    # Work on a copy so the caller's patch is left untouched.
    patch = img.copy()
    dark = np.all(patch[:, :, :3] < 30, axis=2)
    patch[dark] = 0
    foreground = ~dark

    # ``region`` is a view into ``bkg``; writing to it updates the background.
    region = bkg[y:y + h, x:x + w]
    region[foreground] = patch[foreground]

    gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 20, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if not contours:
        raise ValueError("insert(): the patch contains no visible object")
    max_contour = max(contours, key=cv2.contourArea)
    xx, yy, box_w, box_h = cv2.boundingRect(max_contour)
    return bkg, (xx + x, yy + y, box_w, box_h)


class Item:
    """One labelled object, cut out of its source image.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, label, img, pts):
        """Mask *img* with the polygon *pts* and keep the cropped object.

        Args:
            label: Class name of the object.
            img: Source BGR image. It is modified in place: every pixel
                outside the polygon is set to 0.
            pts: Polygon vertices as ``[x, y]`` pairs; they are converted to
                integers before the polygon is filled.
        """
        # Class name of this object.
        self.label = label

        polygon = np.array(pts, int)
        # Filled polygon on an all-zero canvas of the image's shape.
        mask = cv2.fillPoly(np.zeros(img.shape), [polygon], (255, 255, 255))
        img[mask == 0] = 0

        cropped, _ = cut(img)
        # Object cropped to its bounding box, on a black background.
        self.cut = cropped

    def get(self):
        """Return the cropped object rotated by a random integer angle.

        The angle is drawn uniformly from 0 to 180 degrees (both inclusive).
        """
        angle = random.randint(0, 180)
        rotated, _ = rotate(self.cut, angle)
        return rotated
    
    
def delete_folder(path):
    """Recursively delete the directory *path*, reporting the outcome.

    Failures (including a missing directory) are printed rather than raised,
    so the function can be used as a best-effort cleanup step.
    """
    try:
        shutil.rmtree(path)
    except OSError as err:
        print(f"删除文件夹 {path} 失败: {err}")
    else:
        print(f"成功删除文件夹 {path}")


def loadLabel(f):
    """Read the UTF-8 encoded JSON file at path *f* and return its content."""
    with open(f, encoding='utf-8') as handle:
        return json.load(handle)


def checkIOU(bboxes, threshold=0.5):
    """Report whether any two boxes overlap by more than *threshold*.

    Despite the name, the measure is not the classic intersection-over-union:
    the intersection area is divided by the area of the *smaller* of the two
    boxes. A small object that is largely covered by a big one is therefore
    detected even though its IoU would be tiny.

    Args:
        bboxes: Sequence of ``(x, y, w, h)`` boxes.
        threshold: Overlap ratio above which a pair counts as overlapping.

    Returns:
        ``True`` if at least one pair exceeds the threshold, else ``False``.
        Pairs involving a zero-area box are skipped because the ratio is
        undefined for them.
    """
    from itertools import combinations

    for box_a, box_b in combinations(bboxes, 2):
        ax, ay, aw, ah = box_a
        bx, by, bw, bh = box_b

        smaller_area = min(aw * ah, bw * bh)
        if smaller_area <= 0:
            # Degenerate box: avoid dividing by zero.
            continue

        overlap_w = max(0, min(ax + aw, bx + bw) - max(ax, bx))
        overlap_h = max(0, min(ay + ah, by + bh) - max(ay, by))

        if overlap_w * overlap_h / smaller_area > threshold:
            return True

    return False


def getNames(everythings):
    """Return the ``label`` of every element of *everythings*, in order."""
    labels = []
    for entry in everythings:
        labels.append(entry.label)
    return labels


def saveLabel(bboxs, types, filename, image_shape=None, class_ids=None):
    """Write bounding boxes to ``<filename>.txt`` in YOLO label format.

    Each output line is ``class_id cx cy w h`` where ``cx``/``cy`` is the
    box centre and all four geometry values are divided by the image width
    (x, w) or image height (y, h).

    Args:
        bboxs: Sequence of ``[x, y, w, h]`` boxes in pixels, with ``x``/``y``
            being the top-left corner.
        types: Class name of each box, in the same order as *bboxs*.
        filename: Output path without the ``.txt`` extension.
        image_shape: ``(height, width, ...)`` of the image the boxes refer
            to. Defaults to the shape of the module-level ``bkg`` image.
        class_ids: Mapping from class name to integer class id. Defaults to
            the module-level ``dics`` mapping.

    Raises:
        KeyError: If a name in *types* is missing from the class mapping.
        ValueError: If *bboxs* and *types* differ in length.
    """
    if class_ids is None:
        class_ids = dics
    if image_shape is None:
        image_shape = bkg.shape
    if len(bboxs) != len(types):
        raise ValueError(
            "saveLabel(): got {} boxes but {} class names".format(
                len(bboxs), len(types)
            )
        )

    # Resolve the ids first so an unknown class fails before a file is made.
    ids = [class_ids[name] for name in types]
    img_h, img_w = image_shape[:2]

    lines = []
    for class_id, (x, y, w, h) in zip(ids, bboxs):
        cx = (x + w / 2) / img_w
        cy = (y + h / 2) / img_h
        lines.append(
            "{:d} {} {} {} {}\n".format(
                int(class_id), cx, cy, w / img_w, h / img_h
            )
        )

    with open("{}.txt".format(filename), "w") as f:
        f.writelines(lines)
            
        
def resize_image(image, max_size):
    """Shrink *image* so that its longer side is at most *max_size* pixels.

    The aspect ratio is kept. Images that already fit are returned unchanged
    (the very same object, not a copy).

    Args:
        image: Image array whose first two dimensions are height and width.
        max_size: Maximum allowed length of the longer side, in pixels.

    Returns:
        The original image if it already fits, otherwise a resized copy.
    """
    height, width = image.shape[:2]
    longest_side = max(height, width)

    if longest_side <= max_size:
        return image

    scale = max_size / longest_side

    # Truncation could yield 0 for a very elongated image, which cv2.resize
    # rejects, so every dimension is clamped to at least one pixel.
    new_height = max(1, int(height * scale))
    new_width = max(1, int(width * scale))

    return cv2.resize(image, (new_width, new_height))


def chooseTypeIndices(everythings, dic):
    """Pick a random key of *dic* and return the indices of matching items.

    Returns:
        The positions in *everythings* whose label equals the chosen key;
        the list is empty when no item carries that label.
    """
    chosen_label = random.choice(list(dic))
    matches = []
    for position, name in enumerate(getNames(everythings)):
        if name == chosen_label:
            matches.append(position)
    return matches

def chooseEqually(everythings, dic):
    """Return the index of a random item, giving every class equal weight.

    A class is drawn uniformly first and then one of its items, so a class
    with many labelled samples is not picked more often than a class with
    few. Only classes that are keys of *dic* and have at least one item in
    *everythings* take part in the draw; classes without any item are
    skipped instead of causing a failure.

    Args:
        everythings: Sequence of objects exposing a ``label`` attribute.
        dic: Mapping whose keys are the class names to choose from.

    Returns:
        An index into *everythings*.

    Raises:
        IndexError: If no item in *everythings* has a label listed in *dic*.
    """
    indices_by_label = {}
    for position, entry in enumerate(everythings):
        if entry.label in dic:
            indices_by_label.setdefault(entry.label, []).append(position)

    if not indices_by_label:
        raise IndexError(
            "chooseEqually(): no item carries a label listed in the class "
            "mapping"
        )

    chosen_label = random.choice(list(indices_by_label))
    return random.choice(indices_by_label[chosen_label])

着手编写你自己的数据集信息

# Class names in YOLO class-id order. The position of a name in this tuple
# is its class id, so the ids always start at 0 and increase by one.
_CLASS_NAMES = (
    'battery',
    'bottle',
    'brick',
    'can',
    'carrot',
    'glass',
    'medicine',
    'mooli',
    'package',
    'pebble',
    'potato',
)

# Mapping from class name to integer class id.
dics = {name: class_id for class_id, name in enumerate(_CLASS_NAMES)}

上面代码中，提到的字典内的内容，替换成你需要识别的物品的类别，需要从0开始递增，目的是为了生成YOLO格式的数据集。

接下来，我这里创建几个目录来存放生成的数据集，你可以照着我的代码继续进行

# Start from a clean output tree: drop any previous run, then recreate the
# root folder followed by its label and image sub-folders.
delete_folder("bboxupload")
for directory in ("bboxupload", "bboxupload/label", "bboxupload/img"):
    os.mkdir(directory)

接下来，读取你的背景图片和标签信息。

path2labels = "lbs-1"  # directory holding the label JSON files
path2imgs = "lbs-1"    # directory holding the labelled images
path2bkgs = 'bkgs'     # directory holding the background images

# Jupyter Notebook creates .ipynb_checkpoints folders. They serve no purpose
# here and would show up in the directory listings, so they are removed.
for folder in (path2labels, path2imgs, path2bkgs):
    delete_folder(folder + "/.ipynb_checkpoints")

# Load every readable background image; anything OpenCV cannot decode
# (imread returns None) is skipped instead of failing later on.
bkgs = []
for name in sorted(os.listdir(path2bkgs)):
    background = cv2.imread(os.path.join(path2bkgs, name))
    if background is None:
        print(f"无法读取背景图片，已跳过: {name}")
        continue
    bkgs.append(background)

everythings = []

# Only .json files are label files. Images may live in the same folder, so
# the listing is filtered by extension; sorting keeps the order reproducible.
alllabels = sorted(
    os.path.splitext(name)[0]
    for name in os.listdir(path2labels)
    if name.endswith(".json")
)

for lab in alllabels:
    data = loadLabel(os.path.join(path2labels, lab + ".json"))
    image_path = os.path.join(path2imgs, data['imagePath'])
    print(image_path)
    ig = cv2.imread(image_path)
    if ig is None:
        print(f"无法读取图片，已跳过: {image_path}")
        continue
    for shape in data["shapes"]:
        # Item masks its image in place, so every shape gets its own copy.
        everythings.append(Item(shape["label"], np.array(ig), shape["points"]))

着手生成

counts = 3                  # number of objects pasted onto each image
num_images = 100            # number of images to generate
max_attempts = 1000         # layout retries per image before giving up
file_prefix = "167Q1ERM"    # common prefix of the generated file names

# Indices of all items whose label is a known class. Kept for the simple
# alternative ``random.choice(indices)``, which does not balance classes.
indices = [i for i, item in enumerate(getNames(everythings)) if item in list(dics.keys())]


for mj in range(num_images):
    for _attempt in range(max_attempts):
        bboxs = []
        types = []
        # Work on a copy so the stored background stays clean. The name
        # ``bkg`` is also read by saveLabel to normalise the coordinates.
        bkg = np.array(random.choice(bkgs))
        merge = bkg
        for _ in range(counts):
            # chooseEqually balances the classes of the drawn objects, so
            # that e.g. bottles do not end up far more frequent than
            # batteries, which can happen with a plain random.choice.
            chosen = chooseEqually(everythings, dics)
            merge, (x, y, w, h) = insert(bkg, resize_image(everythings[chosen].get(), 1024))
            bboxs.append([x, y, w, h])
            types.append(everythings[chosen].label)

        # If one object covers too much of another, discard this layout and
        # try again; otherwise save the image together with its labels.
        if not checkIOU(bboxs):
            saveLabel(bboxs, types, "bboxupload/label/" + file_prefix + str(mj))
            cv2.imwrite("bboxupload/img/{}{}.jpg".format(file_prefix, mj), merge)
            break
    else:
        # Every attempt produced overlapping objects: report it and move on
        # instead of looping forever.
        print(f"第 {mj} 张图片在 {max_attempts} 次尝试后仍有物体重叠，已跳过")