问题描述:Registry 中存储的镜像数量过多,占用了大量磁盘空间,最终导致磁盘使用率达到 100%,造成服务异常(如无法推送新镜像、拉取镜像超时等)。

解决方案代码逻辑:

  1. 查询待清理镜像 :从数据库获取所有已标记为软删除(is_deleted = 1)且创建时间超过指定天数的镜像记录,生成待清理清单。
  2. 安全检查:对于每个待清理镜像,通过 Registry API 获取其 manifest digest,并检查该 digest 是否被多个 tag 引用。只有当引用数为 1(即该 manifest 仅被当前 tag 使用)时,才执行删除操作,避免误删仍被其他 tag 依赖的镜像。
  3. 删除 manifest :调用 Registry API 的 DELETE /v2/<name>/manifests/<digest> 接口,删除镜像的 manifest 文件。
  4. 释放存储空间:删除 manifest 后,镜像的底层层(blob)并不会立即删除。需要手动运行 Registry 自带的垃圾回收(GC)命令,根据引用计数清理不再被任何 manifest 引用的 blob,从而真正释放磁盘空间。
  5. 共享层保护:如果多个镜像共享相同的基础层,删除其中一个镜像的 manifest 不会影响其他镜像对该基础层的引用。GC 执行时会保留引用计数大于 0 的 blob,确保共享层不被误删。

总结

  • 删除操作删除的是 manifest 文件(相当于镜像的目录清单),而不是直接删除层(blob)。
  • 手动 GC 才会真正删除不再被任何 manifest 引用的 blob。
  • Registry 维护引用计数:每个 blob 被哪些 manifest 引用。共享层(如基础层 L)只要还有至少一个 manifest 引用它,GC 就不会删除它。
  • 每层有唯一的内容摘要(digest)。
  • 整个镜像也有一个唯一的 digest,即 manifest digest。

完整代码

import pymysql

from datetime import datetime, timedelta

import requests

import logging

import argparse

import os

# Registry configuration (adjust for your actual environment).
registry_url = "http://your-registry-host:port"  # base URL for Registry v2 API calls
registry_host = "your-registry-host:port"        # host prefix used when listing images

def setup_logger():
    """Create and return the "image_cleanup" logger.

    DEBUG and above goes to image_cleanup.log (UTF-8); INFO and above is
    echoed to the console. Both share a timestamped format.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger("image_cleanup")
    logger.setLevel(logging.DEBUG)

    # Drop stale handlers so repeated calls don't duplicate every message.
    if logger.hasHandlers():
        logger.handlers.clear()

    file_handler = logging.FileHandler("image_cleanup.log", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger


logger = setup_logger()

def connect_to_db():
    """Open and return a pymysql connection to the image database.

    Terminates the process with status 1 when the connection fails,
    since nothing in this script can proceed without the database.

    Returns:
        pymysql.connections.Connection: an open connection.
    """
    try:
        conn = pymysql.connect(
            host='your-db-host',
            user='your-db-user',
            password='your-db-password',
            database='your-db-name',
            charset='utf8mb4'
        )
        return conn
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        # The exit() builtin comes from the site module and is not
        # guaranteed everywhere; SystemExit is the reliable equivalent.
        raise SystemExit(1)

def query_images(days):
    """Fetch soft-deleted image rows older than *days* days.

    Args:
        days: age threshold in days; values <= 0 return every
            soft-deleted image regardless of age.

    Returns:
        tuple of rows (name, project_name, adjusted_create_time,
        user_name, real_tag); empty on query failure.
    """
    conn = connect_to_db()
    cursor = conn.cursor()
    # create_time is stored in UTC; shift +8h to local (CST) for display.
    query = """
    SELECT name, project_name,
    DATE_ADD(create_time, INTERVAL 8 HOUR) as adjusted_create_time,
    user_name, real_tag
    FROM image
    WHERE is_deleted = 1
    """
    params = []
    if days > 0:
        date_threshold = datetime.now() - timedelta(days=days)
        date_threshold_str = date_threshold.strftime('%Y-%m-%d %H:%M:%S')
        query += " AND create_time < %s"
        params.append(date_threshold_str)
    try:
        cursor.execute(query, params)
        results = cursor.fetchall()
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        results = []
    finally:
        # Always release the cursor/connection, even on failure.
        cursor.close()
        conn.close()
    return results

def get_tag_digest(repo, tag):
    """Resolve repo:tag to its manifest digest via the Registry v2 API.

    Args:
        repo: repository name (e.g. "my-nginx").
        tag: tag within the repository.

    Returns:
        The Docker-Content-Digest header value, or None when the tag is
        missing or the registry is unreachable.
    """
    url = f"{registry_url}/v2/{repo}/manifests/{tag}"
    # The registry only returns the canonical digest when a v2/OCI
    # manifest media type is requested explicitly in Accept.
    headers = {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"}
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp.status_code == 200:
            return resp.headers.get("Docker-Content-Digest")
        logger.debug(f"Get digest failed for {repo}:{tag}, status: {resp.status_code}")
    except Exception as e:
        logger.error(f"Request error for {repo}:{tag}: {e}")
    return None

def get_all_tags(repo):
    """List every tag in *repo* via the Registry v2 tags/list endpoint.

    Returns:
        list of tag names; [] on any failure. The `or []` guards against
        registries that return `"tags": null` for emptied repositories,
        which would otherwise yield None here.
    """
    url = f"{registry_url}/v2/{repo}/tags/list"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            return resp.json().get("tags") or []
    except Exception as e:
        logger.error(f"Failed to get tags for {repo}: {e}")
    return []

def get_digest_reference_count(repo, digest):
    """Count how many tags in *repo* currently resolve to *digest*.

    Used as a safety check: a manifest is only deleted when exactly one
    tag references it, so shared manifests survive.

    Returns:
        int: number of referencing tags (0 when digest is falsy).
    """
    if not digest:
        return 0
    return sum(1 for tag in get_all_tags(repo) if get_tag_digest(repo, tag) == digest)

def safe_delete_image(repo, tag):
    """Delete repo:tag's manifest from the registry when safe to do so.

    "Safe" means the digest resolves AND exactly one tag references it,
    so manifests still shared by other tags are never removed. Blobs are
    only reclaimed later by the registry's garbage-collect run.

    Returns:
        bool: True when the registry accepted the delete, False otherwise.
    """
    logger.info(f"Attempting to delete: {repo}:{tag}")
    digest = get_tag_digest(repo, tag)
    if not digest:
        logger.warning(f"Cannot get digest for {repo}:{tag}, skipping.")
        return False
    # Skip deletion when other tags still point at the same manifest.
    ref_count = get_digest_reference_count(repo, digest)
    if ref_count > 1:
        logger.warning(f"Digest {digest} is referenced by {ref_count} tags, skipping deletion of {repo}:{tag}")
        return False
    # Perform the deletion — the v2 API requires deleting by digest, not tag.
    delete_url = f"{registry_url}/v2/{repo}/manifests/{digest}"
    logger.debug(f"Delete URL: {delete_url}")
    try:
        # timeout added: a hung delete must not stall the whole batch.
        resp = requests.delete(delete_url, timeout=10)
        if resp.status_code in (200, 202):
            return True
        logger.error(f"Delete API returned status {resp.status_code}: {resp.text}")
        return False
    except Exception as e:
        logger.error(f"Delete request failed: {e}")
        return False

def parse_image_string(full_image_string):
    """Parse a string like 'your-registry-host:port/my-nginx:v1.0'.

    Args:
        full_image_string: full image reference, optionally prefixed
            with http:// or https://.

    Returns:
        (repo, tag) tuple; tag defaults to 'latest' when absent, and
        (None, None) when the string has no host/path separator or
        cannot be parsed.
    """
    try:
        # 1. Strip an explicit protocol prefix if present. Matching only
        #    "http://"/"https://" (not bare "http") avoids mangling
        #    hostnames that merely start with "http".
        if full_image_string.startswith(("http://", "https://")):
            full_image_string = full_image_string.split("//", 1)[1]
        # 2. Split the registry host off the repository path.
        if '/' not in full_image_string:
            return None, None
        _, path_part = full_image_string.split('/', 1)
        # 3. Split the repository path into name and tag; rsplit keeps
        #    any ':' inside the path with the repo name.
        if ':' not in path_part:
            return path_part, 'latest'
        repo, tag = path_part.rsplit(':', 1)
        return repo, tag
    except Exception as e:
        logger.error(f"Failed to parse image string '{full_image_string}': {e}")
        return None, None

def main():
    """CLI entry point.

    list <days>: dump soft-deleted images older than <days> days to
        output.txt (tab-separated: image, project, create time, user).
    rm <file>: delete every image listed in <file> (tab-separated,
        image reference in the first column) from the registry.
    """
    parser = argparse.ArgumentParser(description="Image Cleanup Script")
    parser.add_argument("action", choices=["list", "rm"], help="Action to perform: list or rm")
    parser.add_argument("param", help="Days for 'list' or file path for 'rm'")
    args = parser.parse_args()

    if args.action == "list":
        days = int(args.param)
        results = query_images(days)
        with open('output.txt', 'w', encoding='utf-8') as file:
            for row in results:
                name, project_name, adjusted_create_time, user_name, real_tag = row
                repository = name
                tag = real_tag
                name_with_prefix = f"{registry_host}/{repository}:{tag}"
                line = '\t'.join([name_with_prefix, str(project_name), str(adjusted_create_time), str(user_name)])
                file.write(line + '\n')
                logger.info(f"Listed image: {name_with_prefix}")
    elif args.action == "rm":
        file_path = args.param
        if not os.path.isfile(file_path):
            logger.error(f"File not found: {file_path}")
            return
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                parts = line.split('\t')
                full_image = parts[0]
                logger.debug(f"Processing line: {line}")
                logger.debug(f"Extracted image: {full_image}")
                repository, tag = parse_image_string(full_image)
                if not repository or not tag:
                    logger.warning(f"Invalid image format in line: {line}")
                    continue
                if safe_delete_image(repository, tag):
                    logger.info(f"Successfully deleted: {full_image}")
                else:
                    logger.warning(f"Failed to delete: {full_image}")


# Fixed entry guard: the original `if name == "main":` raises NameError
# (and would never be true); the dunder form is required.
if __name__ == "__main__":
    main()

相关推荐
樽酒ﻬق4 小时前
构筑容器化基石:Docker 稳定版本抉择、极速安装与配置全解
java·docker·运维开发
做cv的小昊3 天前
【conda】打包已有conda环境并在其他服务器上搭建
运维·服务器·python·conda·运维开发·pip·开发
运维行者_9 天前
金融和电商行业如何使用网络监控保障业务稳定?
开发语言·网络·人工智能·安全·web安全·机器学习·运维开发
剑飞的编程思维14 天前
电商系统三类迭代方案评审重点
学习·系统架构·自动化·运维开发·学习方法
钰衡大师17 天前
Nohup 使用技术文档
linux·服务器·运维开发·unix
hzhsec20 天前
AI Security Agent:用自然语言做安全巡检,AI 自己跑命令
人工智能·安全·运维开发·ai编程
yeapT22 天前
宝塔部署各类项目踩坑,持续更新
运维开发
苦逼IT运维22 天前
SVN 仓库目录迁移,仓库 “降级” 成子目录实战
linux·运维·ci/cd·svn·运维开发
慧天城寻1 个月前
H3C巡检命令与避坑技巧
运维·网络·运维开发