Python: Delete Duplicate Images (MD5 + pHash + ORB Image Recognition), Final Version: Set One Directory and Run (Subfolders / Chinese Paths / CSV Report / Move First, Delete Later)
Most people approach "deleting duplicate images" with MD5 alone. It does remove byte-identical files, but it breaks down in cases like these:
- the same image was recompressed (the quality changed)
- the resolution differs (the image was rescaled)
- a screenshot had its edges cropped slightly (very similar, but not the same file)
So this post gives you a sturdier dedup pipeline, closer to real "image recognition":
✅ MD5: deletes duplicates whose file contents are byte-identical (fastest and exact)
✅ pHash (perceptual hash): quickly screens out "looks very similar" candidates
✅ ORB image recognition (second-pass confirmation): keypoint feature matching that cuts down pHash false positives (closer to reverse image search)
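Put together, the per-pair decision reduces to one predicate. Here is a sketch, assuming the default MODE = "md5+phash+orb"; it reuses the functions and thresholds defined in the full script in section 4, so it summarizes the script's logic rather than adding new behavior:

def is_duplicate(a: str, b: str) -> bool:
    if file_md5(a) == file_md5(b):              # stage 1: byte-identical -> exact duplicate
        return True
    pa, pb = phash_64(a), phash_64(b)
    if pa is None or pb is None:                # unreadable image: never a duplicate
        return False
    if hamming_distance_64(pa, pb) > PHASH_TH:  # stage 2: pHash screens candidates
        return False
    good, sim = orb_similarity(a, b)            # stage 3: ORB confirms
    return good >= ORB_MIN_GOOD_MATCHES and sim >= ORB_SIMILARITY_TH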
The script is also built so that:
- it is a single file: set the directory and run
- dependencies live inside the .py: missing libraries are pip-installed automatically (a domestic PyPI mirror can be configured)
- it is safe: by default nothing is deleted outright; duplicates are first moved to _duplicates/ and a dedup_report.csv is exported (fully traceable)
- Chinese paths are supported: Chinese directory and file names work on Windows
1) How to use it (the short version)
Step 1: save the script
Save the complete code further down as: dedup_images_run.py
Step 2: change exactly one thing
Open the file and edit:
ROOT_DIR = r"D:\imgs"
to point at your image directory.
Step 3: run it
python dedup_images_run.py
2) What does the script output?
When the run finishes, you will find under ROOT_DIR:
- dedup_report.csv: one row per duplicate decision (kept image, duplicate image, reason, distance/match score, where it was moved)
- _duplicates/: the images judged to be duplicates (moved here by default, with the original directory structure preserved)
Once you have verified the results, you can decide whether to set ACTION to delete to remove files directly.
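Before switching ACTION to delete, you can skim the report with a few lines of standard-library Python (a sketch; the path below is the example directory from step 2, and the column names match the write_report_csv fields in the script):

import csv
with open(r"D:\imgs\dedup_report.csv", encoding="utf-8-sig") as f:
    for row in csv.DictReader(f):
        print(row["reason"], row["dup_path"], "->", row["keep_path"])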
3) How to tune the thresholds (the most common question)
pHash threshold PHASH_TH
- smaller = stricter (fewer false positives, but more misses)
- typical range: 6~10
- recommended default: 8
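For intuition, the entire pHash decision is one Hamming-distance comparison against PHASH_TH (a self-contained sketch; the hash values are made up):

ha = 0x0123456789ABCDEF          # example 64-bit pHash (made up)
hb = ha ^ 0b101                  # flip two bits -> a "very similar" hash
dist = bin(ha ^ hb).count("1")   # Hamming distance = number of differing bits
print(dist, dist <= 8)           # 2 True -> still a candidate at PHASH_TH = 8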
ORB image-recognition thresholds (the more important ones)
ORB requires two conditions to hold at the same time before calling a pair "similar enough" (see the sketch after this list):
- ORB_MIN_GOOD_MATCHES: number of good matches (suggested 15~60)
- ORB_SIMILARITY_TH: the ratio good_matches / min(kp1, kp2) (suggested 0.08~0.20)
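A quick numeric sketch of that double gate (the keypoint counts are made up; the thresholds are the script's defaults):

good, kp1, kp2 = 31, 412, 377        # 31 good matches between two images
sim = good / max(1, min(kp1, kp2))   # 31 / 377 ≈ 0.082
ok = (good >= 25) and (sim >= 0.12)  # False: enough matches, but the ratio is too low
print(sim, ok)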
If duplicates are slipping through (too strict):
- ORB_MIN_GOOD_MATCHES: 25 → 15
- ORB_SIMILARITY_TH: 0.12 → 0.08
If there are too many false positives (too loose):
- ORB_MIN_GOOD_MATCHES: 25 → 40
- ORB_SIMILARITY_TH: 0.12 → 0.18
- and at the same time tighten PHASH_TH: 8 → 6
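The same two directions as copy-paste values (names identical to the script's configuration section):

# stricter: fewer false positives, more misses
PHASH_TH = 6
ORB_MIN_GOOD_MATCHES = 40
ORB_SIMILARITY_TH = 0.18
# looser: fewer misses, more false positives
# PHASH_TH = 8
# ORB_MIN_GOOD_MATCHES = 15
# ORB_SIMILARITY_TH = 0.08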
4) Full code (single file, runnable, auto-installing dependencies + ORB image recognition)
Note: having the dependency install "inside the .py" does not mean no network is needed. On first run, missing libraries are pip-installed automatically; on a fully offline machine, install the dependencies beforehand or keep the wheels locally.
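If you would rather pre-install by hand (this is the same command the script runs for you; the mirror URL is the example from the configuration section):

python -m pip install -U opencv-python numpy tqdm
python -m pip install -U opencv-python numpy tqdm -i https://mirrors.tencent.com/pypi/simple/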
# -*- coding: utf-8 -*-
"""
dedup_images_run.py
Python 批量删除重复图片(MD5 + pHash + ORB 图像识别)------ 单文件可直接运行版
✅ 你只需要改 ROOT_DIR,然后 python 运行即可
✅ 不用手动 pip install:脚本会自动检测缺少的库并安装
✅ 支持子文件夹递归
✅ 支持中文路径(Windows)
✅ 默认不直接删除:先移动到 _duplicates/(更安全)
✅ 导出 dedup_report.csv,方便回溯
去重策略(推荐):
1) MD5:完全重复(内容一模一样)------最快最准
2) pHash:近似重复(缩放/压缩/轻微变化)------快速筛选候选
3) ORB 图像识别:关键点特征匹配------二次确认,减少误判,更像"识图"
依赖(自动安装):
- opencv-python(cv2)
- numpy
- tqdm
注意:
- 第一次自动安装 opencv-python 可能较慢
- 如在国内网络环境,可设置 PIP_INDEX_URL 为镜像
"""
import os
import sys
import csv
import time
import shutil
import hashlib
import subprocess
import importlib.util
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
# =========================================================
# 1) Configuration: ROOT_DIR is the only thing you must change
# =========================================================
ROOT_DIR = r"C:\feeday\imgTest"  # <<< change this to your image directory
RECURSIVE = True                 # recurse into subfolders
# Modes:
# - "md5"            exact duplicates only
# - "phash"          near duplicates only (slightly more false positives)
# - "md5+phash"      md5 first, then phash (recommended)
# - "md5+phash+orb"  md5, then phash, then ORB second-pass confirmation (strongly recommended)
MODE = "md5+phash+orb"
PHASH_TH = 8  # pHash Hamming-distance threshold: typically 6~10, default 8
# ORB image-recognition parameters (used to confirm similarity)
USE_ORB = True              # auto-enabled when MODE contains "orb"; kept here as a manual switch
ORB_NFEATURES = 1200        # number of ORB features: 800~2000 is typical
ORB_RATIO_TEST = 0.75       # 0.70~0.80; smaller = stricter
ORB_MIN_GOOD_MATCHES = 25   # minimum "good matches" to count as a duplicate (suggested 15~60)
ORB_SIMILARITY_TH = 0.12    # threshold on good_matches / min(kp1, kp2) (suggested 0.08~0.20)
ORB_MAX_IMAGE_EDGE = 1200   # shrink large images so the longest edge is at most this (speed)
ACTION = "move"                   # "move" / "delete" / "none"
MOVE_TO = "_duplicates"           # where duplicates go when ACTION == "move" (relative to ROOT_DIR)
REPORT_NAME = "dedup_report.csv"  # report file name (relative to ROOT_DIR)
DRY_RUN = False                   # True = write the report only; move/delete nothing
PREFER_KEEP = "area"              # keep policy: "area" (higher resolution) / "size" / "newest" / "oldest"
# Auto-install dependencies
AUTO_INSTALL = True
# Optional: set a PyPI mirror (useful on Chinese networks)
# e.g. PIP_INDEX_URL = "https://mirrors.tencent.com/pypi/simple/"
PIP_INDEX_URL = ""
# =========================================================
# 2) Auto-install dependencies (inside the .py)
# =========================================================
def _module_exists(module_name: str) -> bool:
return importlib.util.find_spec(module_name) is not None
def ensure_packages():
"""
检测并自动安装依赖库:opencv-python / numpy / tqdm
"""
if not AUTO_INSTALL:
return
need_install = []
req = [
("cv2", "opencv-python"),
("numpy", "numpy"),
("tqdm", "tqdm"),
]
for mod, pkg in req:
if not _module_exists(mod):
need_install.append(pkg)
if not need_install:
return
print("⚠️ 检测到缺少依赖,将自动安装:", ", ".join(need_install))
cmd = [sys.executable, "-m", "pip", "install", "--upgrade"] + need_install
if PIP_INDEX_URL.strip():
cmd += ["-i", PIP_INDEX_URL.strip()]
print("➡️ 执行:", " ".join(cmd))
try:
subprocess.check_call(cmd)
print("✅ 依赖安装完成")
except subprocess.CalledProcessError as e:
print("❌ 自动安装失败:", e)
print("你可以手动执行:")
if PIP_INDEX_URL.strip():
print(f' {sys.executable} -m pip install -U {" ".join(need_install)} -i {PIP_INDEX_URL.strip()}')
else:
print(f' {sys.executable} -m pip install -U {" ".join(need_install)}')
sys.exit(1)
ensure_packages()
import numpy as np
import cv2
from tqdm import tqdm
# =========================================================
# 3) Main logic
# =========================================================
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff"}
def imread_unicode(path: str) -> Optional[np.ndarray]:
"""
OpenCV 兼容中文路径读取
"""
try:
data = np.fromfile(path, dtype=np.uint8)
img = cv2.imdecode(data, cv2.IMREAD_COLOR)
return img
except Exception:
return None
def get_image_hw(path: str) -> Tuple[int, int]:
img = imread_unicode(path)
if img is None:
return 0, 0
h, w = img.shape[:2]
return int(h), int(w)
def file_md5(path: str, chunk_size: int = 1024 * 1024) -> str:
md5 = hashlib.md5()
with open(path, "rb") as f:
while True:
b = f.read(chunk_size)
if not b:
break
md5.update(b)
return md5.hexdigest()
def phash_64(path: str) -> Optional[int]:
"""
pHash 64-bit:灰度->32x32->DCT->取8x8低频->与中位数比较->64bit
"""
img = imread_unicode(path)
if img is None:
return None
try:
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = cv2.resize(gray, (32, 32), interpolation=cv2.INTER_AREA)
gray = np.float32(gray)
dct = cv2.dct(gray)
dct_low = dct[:8, :8].copy()
flat = dct_low.flatten()
        flat_no_dc = flat[1:]  # drop the DC coefficient before taking the median
median = np.median(flat_no_dc)
bits = (flat > median).astype(np.uint8)
h = 0
for b in bits:
h = (h << 1) | int(b)
return int(h)
except Exception:
return None
def hamming_distance_64(a: int, b: int) -> int:
    # bin().count("1") also works on Python < 3.10 (int.bit_count needs 3.10+)
    return bin(a ^ b).count("1")
# -----------------------------
# ORB image recognition (keypoint feature matching)
# -----------------------------
_orb = None
_bf = None
_orb_cache: Dict[str, Tuple[int, Optional[np.ndarray]]] = {}
def _get_orb():
global _orb
if _orb is None:
_orb = cv2.ORB_create(nfeatures=int(ORB_NFEATURES))
return _orb
def _get_bf():
global _bf
if _bf is None:
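        # crossCheck must stay False: knnMatch with k=2 plus the ratio test needs
        # both nearest neighbours for every descriptor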
_bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
return _bf
def _resize_for_orb(img: np.ndarray) -> np.ndarray:
h, w = img.shape[:2]
m = max(h, w)
if m <= ORB_MAX_IMAGE_EDGE:
return img
scale = ORB_MAX_IMAGE_EDGE / float(m)
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
def orb_signature(path: str) -> Tuple[int, Optional[np.ndarray]]:
"""
返回:(keypoints_count, descriptors)
descriptors 可能为 None(例如纯色图/读图失败)
"""
if path in _orb_cache:
return _orb_cache[path]
img = imread_unicode(path)
if img is None:
_orb_cache[path] = (0, None)
return (0, None)
try:
img = _resize_for_orb(img)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
orb = _get_orb()
kps, des = orb.detectAndCompute(gray, None)
kp_count = 0 if kps is None else len(kps)
_orb_cache[path] = (kp_count, des)
return _orb_cache[path]
except Exception:
_orb_cache[path] = (0, None)
return (0, None)
def orb_similarity(path_a: str, path_b: str) -> Tuple[int, float]:
"""
计算 ORB 相似度:
- good_matches:通过 ratio test 的好匹配数量
- similarity:good_matches / min(kp_count_a, kp_count_b)
"""
kp_a, des_a = orb_signature(path_a)
kp_b, des_b = orb_signature(path_b)
if des_a is None or des_b is None:
return 0, 0.0
if kp_a <= 0 or kp_b <= 0:
return 0, 0.0
try:
bf = _get_bf()
matches = bf.knnMatch(des_a, des_b, k=2)
good = 0
ratio = float(ORB_RATIO_TEST)
for m_n in matches:
if len(m_n) < 2:
continue
m, n = m_n[0], m_n[1]
if m.distance < ratio * n.distance:
good += 1
denom = max(1, min(kp_a, kp_b))
sim = good / float(denom)
return good, float(sim)
except Exception:
return 0, 0.0
@dataclass
class ImgInfo:
path: str
size: int
w: int
h: int
@property
def area(self) -> int:
return self.w * self.h
def choose_keep(a: ImgInfo, b: ImgInfo, prefer: str) -> Tuple[ImgInfo, ImgInfo]:
if prefer == "area":
if a.area != b.area:
return (a, b) if a.area > b.area else (b, a)
if a.size != b.size:
return (a, b) if a.size > b.size else (b, a)
return (a, b)
if prefer == "size":
if a.size != b.size:
return (a, b) if a.size > b.size else (b, a)
if a.area != b.area:
return (a, b) if a.area > b.area else (b, a)
return (a, b)
if prefer in {"newest", "oldest"}:
ta = os.path.getmtime(a.path)
tb = os.path.getmtime(b.path)
if prefer == "newest":
return (a, b) if ta >= tb else (b, a)
else:
return (a, b) if ta <= tb else (b, a)
return choose_keep(a, b, "area")
def iter_images(root: str, recursive: bool) -> List[str]:
root = os.path.abspath(root)
out = []
    if recursive:
        for dp, dns, fns in os.walk(root):
            # prune the duplicates folder so a re-run does not re-scan moved files
            dns[:] = [d for d in dns if d != MOVE_TO]
            for fn in fns:
                ext = os.path.splitext(fn)[1].lower()
                if ext in IMAGE_EXTS:
                    out.append(os.path.join(dp, fn))
else:
for fn in os.listdir(root):
p = os.path.join(root, fn)
if os.path.isfile(p):
ext = os.path.splitext(fn)[1].lower()
if ext in IMAGE_EXTS:
out.append(p)
return out
def ensure_dir(p: str) -> None:
os.makedirs(p, exist_ok=True)
def safe_move(src: str, dst: str) -> str:
base, ext = os.path.splitext(dst)
final = dst
i = 1
while os.path.exists(final):
final = f"{base}({i}){ext}"
i += 1
ensure_dir(os.path.dirname(final))
shutil.move(src, final)
return final
def write_report_csv(report_path: str, rows: List[dict]) -> None:
ensure_dir(os.path.dirname(report_path) if os.path.dirname(report_path) else ".")
fieldnames = [
"time", "mode",
"keep_path", "dup_path",
"reason", "value",
"phash_distance",
"orb_good_matches", "orb_similarity",
"keep_w", "keep_h", "keep_size",
"dup_w", "dup_h", "dup_size",
"dup_new_path",
]
with open(report_path, "w", newline="", encoding="utf-8-sig") as f:
w = csv.DictWriter(f, fieldnames=fieldnames)
w.writeheader()
for r in rows:
w.writerow(r)
def dedup_by_md5(paths: List[str], prefer_keep: str) -> List[Tuple[str, str, str]]:
"""
返回重复对:(dup_path, keep_path, md5)
"""
md5_map: Dict[str, ImgInfo] = {}
dups: List[Tuple[str, str, str]] = []
for p in tqdm(paths, desc="MD5 扫描", unit="img"):
try:
h = file_md5(p)
except Exception:
continue
try:
size = os.path.getsize(p)
except Exception:
size = 0
hh, ww = get_image_hw(p)
cur = ImgInfo(path=p, size=size, w=ww, h=hh)
if h not in md5_map:
md5_map[h] = cur
else:
old = md5_map[h]
keep, drop = choose_keep(old, cur, prefer_keep)
md5_map[h] = keep
dups.append((drop.path, keep.path, h))
return dups
def dedup_by_phash_with_orb(
paths: List[str],
prefer_keep: str,
phash_th: int,
use_orb: bool
) -> List[Tuple[str, str, str, int, int, float]]:
"""
返回重复对:
(dup_path, keep_path, phash_hex, phash_dist, orb_good, orb_sim)
说明:
- 先用 pHash 快速筛选候选(phash_dist <= phash_th)
- 若启用 ORB,则对候选进行二次"图像识别"确认
"""
hashes: Dict[str, int] = {}
infos: Dict[str, ImgInfo] = {}
for p in tqdm(paths, desc="pHash 计算", unit="img"):
ph = phash_64(p)
if ph is None:
continue
hashes[p] = ph
try:
size = os.path.getsize(p)
except Exception:
size = 0
hh, ww = get_image_hw(p)
infos[p] = ImgInfo(path=p, size=size, w=ww, h=hh)
    # Bucket by the top 16 bits of the hash for speed; near-duplicates whose
    # hashes differ inside those 16 bits are never compared (recall trade-off).
buckets: Dict[int, List[str]] = {}
for p, ph in hashes.items():
key = (ph >> 48) & 0xFFFF
buckets.setdefault(key, []).append(p)
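    # keep_map maps each path to its current "kept" representative (one-hop only,
    # not a full union-find; the os.path.exists() check before moving/deleting
    # covers the rare chained case)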
keep_map: Dict[str, str] = {p: p for p in hashes.keys()}
dups: List[Tuple[str, str, str, int, int, float]] = []
    for key in tqdm(list(buckets.keys()), desc="pHash bucket compare", unit="bucket"):
group = buckets[key]
if len(group) <= 1:
continue
for i in range(len(group)):
a = group[i]
a_keep = keep_map.get(a, a)
if a_keep not in hashes:
continue
for j in range(i + 1, len(group)):
b = group[j]
b_keep = keep_map.get(b, b)
if b_keep not in hashes:
continue
ha = hashes[a_keep]
hb = hashes[b_keep]
dist = hamming_distance_64(ha, hb)
                # pHash screens candidates first
if dist > phash_th:
continue
orb_good = 0
orb_sim = 0.0
if use_orb:
orb_good, orb_sim = orb_similarity(a_keep, b_keep)
                    # ORB second-pass gate: both conditions must hold to count as a duplicate
if orb_good < int(ORB_MIN_GOOD_MATCHES):
continue
if orb_sim < float(ORB_SIMILARITY_TH):
continue
ia = infos.get(a_keep)
ib = infos.get(b_keep)
if ia is None or ib is None:
continue
keep, drop = choose_keep(ia, ib, prefer_keep)
keep_map[keep.path] = keep.path
keep_map[drop.path] = keep.path
ph_hex = f"{hashes[keep.path]:016x}"
dups.append((drop.path, keep.path, ph_hex, dist, orb_good, orb_sim))
return dups
def main():
root = os.path.abspath(ROOT_DIR)
if not os.path.isdir(root):
print(f"❌ 目录不存在:{root}")
return
t0 = time.time()
paths = iter_images(root, RECURSIVE)
if not paths:
print("⚠️ 未找到图片文件。")
return
duplicates_dir = os.path.join(root, MOVE_TO)
report_path = os.path.join(root, REPORT_NAME)
mode = MODE.strip().lower()
use_orb = (("orb" in mode) and USE_ORB)
print("=================================================")
print("📁 根目录:", root)
print("🧾 图片数量:", len(paths))
print(f"🧠 模式:{MODE} | pHash阈值:{PHASH_TH} | 保留策略:{PREFER_KEEP}")
print(f"🧷 动作:{ACTION} | dry-run:{DRY_RUN}")
if use_orb:
print("🧠 ORB 图像识别:启用")
print(f" ORB_NFEATURES={ORB_NFEATURES}, RATIO={ORB_RATIO_TEST}, MIN_GOOD={ORB_MIN_GOOD_MATCHES}, SIM_TH={ORB_SIMILARITY_TH}")
else:
print("🧠 ORB 图像识别:未启用")
print("=================================================")
report_rows: List[dict] = []
processed = 0
# 1) MD5
md5_dups: List[Tuple[str, str, str]] = []
remain_after_md5 = paths
    if "md5" in mode:
md5_dups = dedup_by_md5(paths, PREFER_KEEP)
dup_set = set([d[0] for d in md5_dups])
remain_after_md5 = [p for p in paths if p not in dup_set]
# 2) pHash (+ ORB)
phash_dups: List[Tuple[str, str, str, int, int, float]] = []
if ("phash" in mode):
phash_dups = dedup_by_phash_with_orb(
remain_after_md5,
prefer_keep=PREFER_KEEP,
phash_th=PHASH_TH,
use_orb=use_orb
)
def do_action(dup_path: str) -> str:
nonlocal processed
if DRY_RUN or ACTION == "none":
return ""
if not os.path.exists(dup_path):
return ""
if ACTION == "delete":
try:
os.remove(dup_path)
processed += 1
return ""
except Exception:
return ""
if ACTION == "move":
try:
rel = os.path.relpath(dup_path, root)
dst = os.path.join(duplicates_dir, rel)
newp = safe_move(dup_path, dst)
processed += 1
return newp
except Exception:
return ""
return ""
    # Record MD5 duplicates
for dup_path, keep_path, md5v in md5_dups:
if not os.path.exists(dup_path):
continue
keep_h, keep_w = get_image_hw(keep_path)
dup_h, dup_w = get_image_hw(dup_path)
keep_size = os.path.getsize(keep_path) if os.path.exists(keep_path) else 0
dup_size = os.path.getsize(dup_path) if os.path.exists(dup_path) else 0
new_path = do_action(dup_path)
report_rows.append({
"time": time.strftime("%Y-%m-%d %H:%M:%S"),
"mode": "md5",
"keep_path": keep_path,
"dup_path": dup_path,
"reason": "MD5 完全重复",
"value": md5v,
"phash_distance": "",
"orb_good_matches": "",
"orb_similarity": "",
"keep_w": keep_w, "keep_h": keep_h, "keep_size": keep_size,
"dup_w": dup_w, "dup_h": dup_h, "dup_size": dup_size,
"dup_new_path": new_path,
})
    # Record pHash (+ORB) duplicates
for dup_path, keep_path, ph_hex, ph_dist, orb_good, orb_sim in phash_dups:
if not os.path.exists(dup_path):
continue
keep_h, keep_w = get_image_hw(keep_path)
dup_h, dup_w = get_image_hw(dup_path)
keep_size = os.path.getsize(keep_path) if os.path.exists(keep_path) else 0
dup_size = os.path.getsize(dup_path) if os.path.exists(dup_path) else 0
new_path = do_action(dup_path)
reason = "pHash 近似重复"
if use_orb:
reason = "pHash + ORB 图像识别确认"
report_rows.append({
"time": time.strftime("%Y-%m-%d %H:%M:%S"),
"mode": "phash+orb" if use_orb else "phash",
"keep_path": keep_path,
"dup_path": dup_path,
"reason": reason,
"value": ph_hex,
"phash_distance": ph_dist,
"orb_good_matches": orb_good if use_orb else "",
"orb_similarity": f"{orb_sim:.4f}" if use_orb else "",
"keep_w": keep_w, "keep_h": keep_h, "keep_size": keep_size,
"dup_w": dup_w, "dup_h": dup_h, "dup_size": dup_size,
"dup_new_path": new_path,
})
write_report_csv(report_path, report_rows)
dt = time.time() - t0
print("")
print("✅ 完成")
print(f"📄 报告:{report_path}")
if ACTION == "move":
print(f"📦 重复文件目录:{duplicates_dir}(保持原目录结构)")
print(f"🧮 识别重复条目:{len(report_rows)}")
if DRY_RUN or ACTION == "none":
print("🧪 dry-run/none:未移动/未删除任何文件")
else:
print(f"🧹 实际处理文件数(移动/删除):{processed}")
print(f"⏱ 用时:{dt:.2f} 秒")
if __name__ == "__main__":
main()