A while ago I dug into the git commit history of https://github.com/xylcbd/EasyCNN and found many duplicate commits. Back then I browsed the commit tree in the SmartGit GUI and deleted the duplicates by hand, which was quite tedious.
Recently I discovered the git format-patch and git am commands, which turn out to be very helpful for cleaning up EasyCNN's duplicated commit history. Writing it down here; it is genuinely interesting.
The general idea: generate one patch per commit of the original repository, then scan the patch files in numeric order. If two adjacent patch files have the same name (ignoring the patch number), the same commit time, and the same git diff, treat them as duplicate commits and keep only the first one. Finally, create a fresh repository in a new directory and apply the surviving patches one by one.
The concrete commands are:
```bash
cd EasyCNN
# generate one patch file per commit of the whole history
git format-patch --root
cd ..
mkdir ecnn
cd ecnn
git init
git am ../EasyCNN/0001-initialize-repo.patch
git am ../EasyCNN/0003-init-cnn-framework.patch
...
git am ../EasyCNN/0103-Update-PoolingLayer.cpp.patch
```
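After `git format-patch --root`, every commit becomes a numbered file such as `0103-Update-PoolingLayer.cpp.patch`: the prefix is the position in history and the rest is the sanitized commit subject. The file itself is an mbox-style message; the layout below is generic, with placeholder values rather than actual EasyCNN content:

```
From <commit sha> Mon Sep 17 00:00:00 2001
From: <author name> <email>
Date: <author date of the commit>
Subject: [PATCH NNN/MMM] Update PoolingLayer.cpp

---
 <path>/PoolingLayer.cpp | <diffstat>
 1 file changed, ...

diff --git a/<path>/PoolingLayer.cpp b/<path>/PoolingLayer.cpp
...
```

The deduplication below keys off exactly these pieces: the subject recovered from the file name, and everything from the first `diff --git` line onward; the `Date:` header is where the commit time mentioned in the general idea would come from.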
Running git am on the patches one by one turned out to be repetitive, so I had Qwen write a Python script, which also makes it easy for others to reproduce the steps:
The duplicate check runs in O(N) time and only handles the case where the duplicated commits are adjacent.
```python
#!/usr/bin/env python3
import sys
import re
import subprocess
from pathlib import Path


def extract_subject_from_filename(filename: str) -> str:
    # e.g. "0103-Update-PoolingLayer.cpp.patch" -> "Update-PoolingLayer.cpp"
    stem = Path(filename).stem
    return re.sub(r'^\d+-', '', stem)
def extract_diff_content(patch_file):
    # Everything from the first "diff --git" line onward is the actual diff;
    # the mail headers above it (From/Date/Subject) are ignored.
    with open(patch_file, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    diff_lines = []
    in_diff = False
    for line in lines:
        if line.startswith('diff --git'):
            in_diff = True
        if in_diff:
            diff_lines.append(line)
    return ''.join(diff_lines).strip()
def deduplicate_patches_linear(patch_dir: Path, output_dir: Path):
    # sort by file name (0001, 0002, ...)
    patches = sorted(patch_dir.glob("*.patch"), key=lambda x: x.name)
    if not patches:
        print("❌ no .patch files found")
        return []

    unique_patches = []
    prev_subject = None
    prev_diff = None

    for p in patches:
        try:
            subject = extract_subject_from_filename(p.name)
            diff = extract_diff_content(p)
            # same subject and same diff as the previous patch -> skip it
            if subject == prev_subject and diff == prev_diff:
                print(f"⏭️ skipping duplicate patch: {p.name}")
                continue
            # otherwise keep it
            unique_patches.append(p)
            prev_subject = subject
            prev_diff = diff
        except Exception as e:
            print(f"⚠️ skipping unreadable file {p}: {e}")

    # write the deduplicated patches out under new sequential numbers
    output_dir.mkdir(exist_ok=True)
    final_list = []
    for i, p in enumerate(unique_patches, 1):
        new_name = f"{i:04d}_{p.name}"
        dst = output_dir / new_name
        dst.write_text(p.read_text(encoding='utf-8', errors='replace'), encoding='utf-8')
        final_list.append(dst)

    print(f"✅ {len(patches)} patches originally, {len(final_list)} left after deduplication")
    return final_list
def apply_patches_in_new_repo(patch_files, target_repo_dir):
    target = Path(target_repo_dir)
    if target.exists():
        print(f"❌ target directory {target} already exists, please remove it first")
        return False

    subprocess.run(["git", "init", str(target)], check=True)

    for patch in patch_files:
        print(f"📦 applying {patch.name} ...")
        result = subprocess.run(
            ["git", "-C", str(target), "am", str(patch)],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            print(f"❌ failed to apply: {patch}")
            print(result.stderr)
            return False

    print(f"🎉 all {len(patch_files)} patches applied successfully! repository at: {target}")
    return True
def main():
    if len(sys.argv) != 3:
        print("usage: python3 dedup.py <patch_dir> <new_repo_dir>")
        sys.exit(1)

    patch_dir = Path(sys.argv[1]).resolve()
    new_repo_dir = Path(sys.argv[2]).resolve()

    if not patch_dir.is_dir():
        print(f"❌ patch directory does not exist: {patch_dir}")
        sys.exit(1)

    dedup_dir = patch_dir.parent / "deduplicated_patches"
    unique_patches = deduplicate_patches_linear(patch_dir, dedup_dir)
    if not unique_patches:
        sys.exit(1)

    apply_patches_in_new_repo(unique_patches, new_repo_dir)


if __name__ == "__main__":
    main()
```
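One gap between the general idea and the script: the idea also compares commit times, while the script only compares the subject (recovered from the file name) and the diff body. If you want that extra check, a minimal sketch for pulling the `Date:` header out of a patch file could look like the following (the helper name is my own, not part of the script above):

```python
def extract_date_from_patch(patch_file) -> str:
    # A format-patch file is an mbox-style message: headers first, then a blank
    # line, then the body. Return the value of the "Date:" header, or "" if missing.
    with open(patch_file, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if not line.strip():               # blank line ends the header block
                return ""
            if line.startswith('Date:'):
                return line[len('Date:'):].strip()
    return ""
```

Hooking it into `deduplicate_patches_linear` just means tracking a `prev_date` next to `prev_subject`/`prev_diff` and requiring all three to match. To run the script itself: `python3 dedup.py EasyCNN ecnn` (the target directory must not exist yet; the script runs `git init` itself and refuses to touch an existing one).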