html文件cdn一键下载并替换

业务场景:

AI生成的html文件,通常会使用多个cdn资源,手动替换或下载太过麻烦,如下py程序为此而生:指定html目录即可自动下载并替换~

python 复制代码
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import hashlib

class CDNDownloader:
    """Download CDN-hosted CSS/JS referenced by local HTML files and rewrite
    those references to point at local copies saved under ``static_dir``.
    """

    def __init__(self, html_dir, static_dir='static'):
        # Root directory scanned for HTML files.
        self.html_dir = os.path.abspath(html_dir)
        # Downloaded assets live under <html_dir>/<static_dir>/<css|js>/.
        self.static_dir = os.path.join(self.html_dir, static_dir)
        os.makedirs(self.static_dir, exist_ok=True)

    def process_all_html_files(self):
        """Process every .html/.htm file found under ``html_dir`` (recursive)."""
        for root, _, files in os.walk(self.html_dir):
            for file in files:
                if file.endswith(('.html', '.htm')):
                    file_path = os.path.join(root, file)
                    print(f"Processing {file_path}...")
                    self.process_single_html(file_path)

    def process_single_html(self, html_path):
        """Rewrite one HTML file in place, downloading any CDN CSS/JS it references."""
        with open(html_path, 'r', encoding='utf-8') as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        # How deep this HTML file sits below html_dir determines how many
        # "../" segments are needed for the relative "static/..." link to
        # resolve correctly from the HTML file's own location.
        rel_path = os.path.relpath(os.path.dirname(html_path), self.html_dir)
        path_prefix = '../' * len(rel_path.split(os.sep)) if rel_path != '.' else ''

        # Stylesheets: <link rel="stylesheet" href="...">
        for link in soup.find_all('link', rel='stylesheet'):
            if 'href' in link.attrs:
                old_url = link['href']
                if self._is_cdn_url(old_url):
                    new_url = self.download_resource(old_url, 'css')
                    link['href'] = f"{path_prefix}{new_url}"

        # Scripts: <script src="...">
        for script in soup.find_all('script', src=True):
            old_url = script['src']
            if self._is_cdn_url(old_url):
                new_url = self.download_resource(old_url, 'js')
                script['src'] = f"{path_prefix}{new_url}"

        # Write the rewritten document back over the original file.
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(str(soup))

    def _is_cdn_url(self, url):
        """Return True for absolute (http/https) or protocol-relative (//) URLs."""
        return url.startswith(('http://', 'https://', '//'))

    def _local_filename(self, url, resource_type):
        """Derive a collision-safe local filename for *url*.

        Uses the URL path's basename; falls back to the URL's MD5 hexdigest
        when the path has no basename, and appends the proper extension when
        it is missing.
        """
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            # No basename (e.g. "https://cdn.example.com/lib/"): hash the
            # full URL so distinct resources get distinct local names.
            filename = hashlib.md5(url.encode()).hexdigest()
        if not filename.endswith(f'.{resource_type}'):
            # BUG FIX: the original replaced the whole name with a constant
            # placeholder here, so every such resource of a given type was
            # written to the same file, each download overwriting the last.
            filename = f"{filename}.{resource_type}"
        return filename

    def download_resource(self, url, resource_type):
        """Download *url* into static/<resource_type>/ and return its new
        relative path (e.g. 'static/js/app.js').

        On any failure (network error, non-200 status) the original URL is
        returned unchanged so the HTML keeps working off the CDN.
        """
        if url.startswith('//'):
            # Protocol-relative URL: assume https.
            url = 'https:' + url

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                filename = self._local_filename(url, resource_type)

                # Group downloads by resource type: static/css/, static/js/.
                resource_dir = os.path.join(self.static_dir, resource_type)
                os.makedirs(resource_dir, exist_ok=True)

                file_path = os.path.join(resource_dir, filename)
                with open(file_path, 'wb') as f:
                    f.write(response.content)

                # Forward slashes so the path works in HTML even when the
                # script runs on Windows.
                return os.path.join('static', resource_type, filename).replace('\\', '/')

        except Exception as e:
            # Best effort: log and keep the original CDN reference.
            print(f"Error downloading {url}: {str(e)}")
            return url

        return url

def process_directory(html_dir, static_dir='static'):
    """Convenience wrapper: build a downloader for *html_dir* and run it."""
    CDNDownloader(html_dir, static_dir).process_all_html_files()

def main():
    """Entry point.

    Preferred path: take the HTML directory from command-line arguments.
    Fallback path: if argument parsing bails out (argparse raises
    SystemExit when required args are missing), process a hard-coded
    list of directories instead.
    """
    try:
        import argparse

        parser = argparse.ArgumentParser(
            description='Download CDN resources from HTML files')
        parser.add_argument('html_dir', help='Directory containing HTML files')
        parser.add_argument(
            '--static-dir', default='static',
            help='Directory to save downloaded resources')

        opts = parser.parse_args()
        process_directory(opts.html_dir, opts.static_dir)

    except SystemExit:
        # No usable CLI arguments: fall back to paths baked into the code.
        for target in ["templates"]:  # example path(s)
            print(f"\n处理目录: {target}")
            process_directory(target)

    print("Done! All CDN resources have been downloaded and HTML files updated.")

if __name__ == '__main__':
    main() 
相关推荐
Ulyanov2 小时前
高保真单脉冲雷达导引头回波生成:Python建模与实践
开发语言·python·仿真·系统设计·单脉冲雷达
Li emily2 小时前
成功接入A股实时行情API获取实时市场数据
人工智能·python·金融·fastapi
糕冷小美n2 小时前
elementuivue2表格不覆盖整个表格添加固定属性
前端·javascript·elementui
小哥不太逍遥2 小时前
Technical Report 2024
java·服务器·前端
沐墨染2 小时前
黑词分析与可疑对话挖掘组件的设计与实现
前端·elementui·数据挖掘·数据分析·vue·visual studio code
anOnion3 小时前
构建无障碍组件之Disclosure Pattern
前端·html·交互设计
threerocks3 小时前
前端将死,Agent 永生
前端·人工智能·ai编程
shehuiyuelaiyuehao3 小时前
22Java对象的比较
java·python·算法
张小凡vip3 小时前
Python异步编程实战:基于async/await的高并发实现
开发语言·python
问道飞鱼3 小时前
【前端知识】Vite用法从入门到实战
前端·vite·项目构建