python将word文档转化成html

将 .docx 文件放到下面代码文件（.py）所在的目录中，运行脚本后会自动转化成对应的 html
代码：
python 复制代码
"""
DOCX 转 HTML 转换脚本
使用 mammoth 库将 .docx 文件转换为 HTML，保留格式
"""

# ============================================================
# 环境要求
# ============================================================
# Python 版本：3.7 及以上（推荐 3.8+）
#
# 依赖库：
#   - mammoth  >= 1.5  （将 docx 转为 HTML）
#
# 安装依赖：
#   pip install mammoth
#
# ============================================================
# 使用流程
# ============================================================
# 1. 将需要转换的 .docx 文件放到与本脚本相同的目录下
# 2. 打开终端，进入脚本所在目录：
#      cd c:\Users\Administrator\Desktop\word
# 3. 执行转换：
#      python convert_docx_to_html.py
# 4. 转换完成后，HTML 文件会自动生成在同级目录下
#    （文件名与 docx 相同，扩展名为 .html）
# 注意：脚本会自动扫描同目录下所有 .docx 文件，无需手动指定
# ============================================================

import glob
import html
import os
import sys

import mammoth

# Working directory: the folder that contains this script. main() scans it
# for .docx files and writes the generated .html files into it.
WORK_DIR = os.path.dirname(os.path.abspath(__file__))


def find_docx_files(directory: str) -> list:
    """Return the ``.docx`` files located directly inside *directory*.

    Word lock files (names starting with ``~$``, created while a document is
    open in Word) are skipped, and so are directories whose name happens to
    end in ``.docx``.

    Args:
        directory: Folder to scan. The scan is not recursive.

    Returns:
        Matching file paths sorted alphabetically, so repeated runs process
        files in the same order. Empty when nothing matches or the folder
        does not exist.
    """
    # Escape the directory part so glob metacharacters that are part of the
    # path itself (e.g. "reports [2024]") are matched literally instead of
    # being interpreted as a pattern, which would silently match nothing.
    pattern = os.path.join(glob.escape(directory), "*.docx")
    return sorted(
        path
        for path in glob.glob(pattern)
        if os.path.isfile(path) and not os.path.basename(path).startswith("~$")
    )


def _build_html_document(title: str, html_body: str) -> str:
    """Wrap a converted HTML fragment in a complete, styled HTML page.

    Args:
        title: Plain-text page title. It is HTML-escaped before being embedded
            so characters such as ``&`` or ``<`` cannot break the markup.
        html_body: HTML fragment inserted verbatim inside ``<body>``.

    Returns:
        The full HTML document as a single string.
    """
    # Quotes are harmless inside <title>, so only &, < and > are escaped.
    safe_title = html.escape(title, quote=False)
    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        body {{
            font-family: "Microsoft YaHei", "SimSun", "Segoe UI", Arial, sans-serif;
            max-width: 800px;
            margin: 40px auto;
            padding: 20px;
            line-height: 1.8;
            color: #333;
            background-color: #fff;
        }}
        h1, h2, h3, h4, h5, h6 {{
            color: #1a1a1a;
            margin-top: 1.5em;
            margin-bottom: 0.8em;
        }}
        p {{
            margin: 0.8em 0;
            text-indent: 0;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 1em 0;
        }}
        th, td {{
            border: 1px solid #ccc;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f5f5f5;
        }}
        strong {{
            font-weight: bold;
        }}
        em {{
            font-style: italic;
        }}
        ul, ol {{
            padding-left: 2em;
        }}
    </style>
</head>
<body>
{html_body}
</body>
</html>"""


def convert_docx_to_html(docx_path: str, html_path: str):
    """Convert a single .docx file into a standalone HTML page.

    The document is converted with mammoth using a custom style map, any
    conversion warnings are printed, and the result is wrapped in a full HTML
    page (with basic CSS) whose title is the .docx file name without its
    extension.

    Args:
        docx_path: Path of the .docx file to read.
        html_path: Path of the HTML file to write (UTF-8, overwritten if it
            already exists).

    Raises:
        Any exception raised while opening/reading the input, by mammoth
        during conversion, or while writing the output is propagated to the
        caller unchanged.
    """
    with open(docx_path, "rb") as docx_file:
        # Custom style map so Word's built-in heading / emphasis styles are
        # turned into the matching semantic HTML elements.
        result = mammoth.convert_to_html(
            docx_file,
            style_map="""
                p[style-name='Heading 1'] => h1:fresh
                p[style-name='Heading 2'] => h2:fresh
                p[style-name='Heading 3'] => h3:fresh
                p[style-name='Heading 4'] => h4:fresh
                r[style-name='Strong'] => strong
                r[style-name='Emphasis'] => em
            """
        )

    # The input file is no longer needed past this point, so reporting and
    # rendering happen after it has been closed.
    html_body = result.value
    messages = result.messages

    # Surface conversion warnings (if any) to the user.
    if messages:
        print(f"  转换警告 ({os.path.basename(docx_path)}):")
        for msg in messages:
            print(f"    - {msg}")

    # Build the complete HTML document, including the base stylesheet.
    title = os.path.splitext(os.path.basename(docx_path))[0]
    html_full = _build_html_document(title, html_body)

    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html_full)

    print(f"  [OK] 已生成: {os.path.basename(html_path)}")


def main():
    """Convert every .docx file found in WORK_DIR and print a summary.

    Each document is converted to an .html file with the same base name,
    written next to it in WORK_DIR. A failure on one file is reported and
    counted but does not stop the remaining conversions. The process exits
    with status 1 when at least one conversion failed.
    """
    banner = "=" * 50
    print(banner)
    print("DOCX 转 HTML 转换工具")
    print(banner)

    # Collect every .docx file sitting next to this script.
    docx_files = find_docx_files(WORK_DIR)
    if not docx_files:
        print("当前目录下未找到任何 .docx 文件")
        return

    print(f"发现 {len(docx_files)} 个 .docx 文件\n")

    succeeded = 0
    failed = 0
    for docx_path in docx_files:
        docx_name = os.path.basename(docx_path)
        stem, _ext = os.path.splitext(docx_name)
        html_path = os.path.join(WORK_DIR, stem + ".html")

        try:
            print(f"\n转换: {docx_name}")
            convert_docx_to_html(docx_path, html_path)
        except Exception as e:
            # Top-level boundary: report the failure and keep going so one
            # bad document does not abort the whole batch.
            print(f"  [FAIL] 转换失败: {e}")
            failed += 1
        else:
            succeeded += 1

    print(f"\n{banner}")
    print(f"转换完成: 成功 {succeeded} 个, 失败 {failed} 个")
    print(banner)

    # Signal failure to the calling shell if anything went wrong.
    if failed > 0:
        sys.exit(1)


# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()