提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
文章目录
需求
删除文件夹中的重复图片,每组重复图片只保留一张;同时希望程序运行得足够快,最好使用多线程。
直接上代码(有注释)
python
# coding=utf-8
import os
import shutil
from loguru import logger
import threading
def match_img(img1, img2):
    """Return the colour-histogram similarity of two image files.

    Both files are loaded with OpenCV, resized to 256x256, split into their
    colour channels and compared channel by channel; the per-channel scores
    are then averaged.

    Args:
        img1: Path of the first image file.
        img2: Path of the second image file.

    Returns:
        A plain ``float`` in ``[0.0, 1.0]``. ``1.0`` means every histogram bin
        of every channel is equal. ``0.0`` is returned when either file cannot
        be decoded as an image, so an unreadable file never compares equal to
        anything.
    """
    import cv2

    def calculate(image1, image2):
        """Return the mean per-bin overlap of two single-channel histograms."""
        # calcHist treats the upper range bound as exclusive, so it must be
        # 256 for pixels of value 255 to be counted.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0.0, 256.0])
        hist2 = cv2.calcHist([image2], [0], None, [256], [0.0, 256.0])
        # Flatten the (256, 1) float32 arrays into Python floats: the sum is
        # then accumulated in double precision and the result is a plain
        # float rather than a one-element array.
        bins1 = hist1.ravel().tolist()
        bins2 = hist2.ravel().tolist()
        degree = 0.0
        for count1, count2 in zip(bins1, bins2):
            if count1 != count2:
                # Counts are non-negative and differ, so the max is > 0.
                degree += 1 - abs(count1 - count2) / max(count1, count2)
            else:
                degree += 1
        return degree / len(bins1)

    def classify_hist_with_split(image1, image2, size=(256, 256)):
        """Resize both images, then average ``calculate`` over their channels."""
        image1 = cv2.resize(image1, size)
        image2 = cv2.resize(image2, size)
        channels1 = cv2.split(image1)
        channels2 = cv2.split(image2)
        total = 0.0
        for channel1, channel2 in zip(channels1, channels2):
            total += calculate(channel1, channel2)
        return total / len(channels1)

    first = cv2.imread(img1)
    second = cv2.imread(img2)
    # imread signals a missing or undecodable file by returning None; passing
    # that on to cv2.resize would raise.
    if first is None or second is None:
        return 0.0
    return classify_hist_with_split(first, second)
def copy_image(source_path, destination_path):
    """Copy the file at ``source_path`` to ``destination_path``.

    Thin wrapper around ``shutil.copy``; the path that ``shutil.copy``
    returns is discarded, so this function returns ``None``.
    """
    shutil.copy(src=source_path, dst=destination_path)
class MatchImg(object):
    """Delete duplicate images in a folder, using one thread per chunk of files.

    The folder listing is cut into chunks of ``chunk_size`` file names and
    each chunk is handled by its own thread. Images are only compared with
    other images of the same chunk, so duplicates that land in different
    chunks are not detected within a single run.

    Args:
        folder_path: Folder that holds the images (default ``'ct_imgs'``).
        chunk_size: Number of file names per chunk/thread (default 5000).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """

    def __init__(self, folder_path='ct_imgs', chunk_size=5000):
        if chunk_size <= 0:
            raise ValueError('chunk_size must be a positive integer')
        self.folder_path = folder_path
        self.chunk_size = chunk_size
        self.thread_lock = threading.Lock()

    def split_list(self, my_list):
        """Return ``my_list`` cut into consecutive slices of ``chunk_size`` items."""
        size = self.chunk_size
        return [my_list[start:start + size]
                for start in range(0, len(my_list), size)]

    def spilt_img(self, img_list, n):
        """Compare each image of ``img_list`` with all images that follow it.

        Args:
            img_list: File names (relative to ``folder_path``) of one chunk.
            n: Index of the worker thread, used only in log messages.
        """
        for i, img in enumerate(img_list):
            logger.debug('第{}线程 第{}轮'.format(n, i))
            other_imgs = img_list[i + 1:]
            # Skip images that an earlier round already deleted as duplicates.
            if other_imgs and os.path.exists(os.path.join(self.folder_path, img)):
                self.for_img(img, other_imgs, n)

    def for_img(self, img, img_list, num):
        """Delete every file in ``img_list`` whose similarity to ``img`` is 1.0.

        Args:
            img: File name of the reference image, which is kept.
            img_list: File names of the candidates to compare against ``img``.
            num: Index of the worker thread, used only in log messages.
        """
        img_path = os.path.join(self.folder_path, img)
        if not os.path.exists(img_path):
            return
        for name in img_list:
            candidate = os.path.join(self.folder_path, name)
            if not os.path.exists(candidate):
                continue
            if match_img(img_path, candidate) == 1.0:
                # The context manager releases the lock even if os.remove
                # raises, so one failed deletion cannot block other threads.
                with self.thread_lock:
                    logger.debug('第{}线程 删除图片路径 {}'.format(num, candidate))
                    os.remove(candidate)

    def run(self):
        """List ``folder_path``, start one thread per chunk and wait for all."""
        img_list = os.listdir(self.folder_path)
        print(len(img_list))
        pool = []
        for index, chunk in enumerate(self.split_list(img_list)):
            worker = threading.Thread(target=self.spilt_img, args=(chunk, index))
            worker.start()
            pool.append(worker)
        for worker in pool:
            worker.join()
# Script entry point: build a MatchImg and start the duplicate-removal run.
if __name__ == '__main__':
    MatchImg().run()
总结
本代码使用的对比算法比较中庸,欢迎指出更好的对比算法。由于程序把图片按每组 5000 张分组,并且只在组内进行两两比较,不同组之间的重复图片在一次运行中不会被发现;因此当图片数量较多时,需要根据实际情况多次运行本程序进行处理。