将 .docx 文件放到下面代码文件(.py)所在的目录中,运行脚本后会自动转换成对应的 HTML 文件
代码:
python
"""
DOCX 转 HTML 转换脚本
使用 mammoth 库将 .docx 文件转换为 HTML,保留格式
"""
# ============================================================
# 环境要求
# ============================================================
# Python 版本:3.7 及以上(推荐 3.8+)
#
# 依赖库:
# - mammoth >= 1.5 (将 docx 转为 HTML)
#
# 安装依赖:
# pip install mammoth
#
# ============================================================
# 使用流程
# ============================================================
# 1. 将需要转换的 .docx 文件放到与本脚本相同的目录下
# 2. 打开终端,进入脚本所在目录:
# cd c:\Users\Administrator\Desktop\word
# 3. 执行转换:
# python convert_docx_to_html.py
# 4. 转换完成后,HTML 文件会自动生成在同级目录下
# (文件名与 docx 相同,扩展名为 .html)
# 注意:脚本会自动扫描同目录下所有 .docx 文件,无需手动指定
# ============================================================
import glob
import html
import os
import sys

import mammoth
# Working directory: the folder that contains this script. It is both the
# place that is scanned for .docx input and the place the HTML is written to.
WORK_DIR = os.path.split(os.path.abspath(__file__))[0]
def find_docx_files(directory: str) -> list:
    """Return the .docx files located directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list of paths (``directory`` joined with each file name), sorted so
        that the processing order is the same on every run. Word lock files,
        whose names start with ``~$``, are left out.
    """
    # Escape the directory part so that glob metacharacters in the folder
    # name itself (e.g. "docs [old]") are matched literally instead of being
    # interpreted as a pattern, which would silently find nothing.
    pattern = os.path.join(glob.escape(directory), "*.docx")
    return sorted(
        path
        for path in glob.glob(pattern)
        # "~$name.docx" is the temporary lock file Word keeps next to a
        # document while it is open; it is not a real document.
        if not os.path.basename(path).startswith("~$")
    )
# Style map handed to mammoth: maps the named Word paragraph/run styles below
# to the corresponding HTML elements.
_STYLE_MAP = """
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Heading 3'] => h3:fresh
p[style-name='Heading 4'] => h4:fresh
r[style-name='Strong'] => strong
r[style-name='Emphasis'] => em
"""


def _build_html_document(title: str, html_body: str) -> str:
    """Wrap an HTML fragment in a complete HTML5 page with basic styling.

    Args:
        title: Plain text for the ``<title>`` element; it is HTML-escaped
            here so characters such as ``&`` or ``<`` cannot break the markup.
        html_body: HTML fragment placed verbatim inside ``<body>``.

    Returns:
        The full document as a string.
    """
    # quote=False: only &, < and > need escaping inside element text.
    safe_title = html.escape(title, quote=False)
    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{safe_title}</title>
<style>
body {{
font-family: "Microsoft YaHei", "SimSun", "Segoe UI", Arial, sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.8;
color: #333;
background-color: #fff;
}}
h1, h2, h3, h4, h5, h6 {{
color: #1a1a1a;
margin-top: 1.5em;
margin-bottom: 0.8em;
}}
p {{
margin: 0.8em 0;
text-indent: 0;
}}
table {{
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}}
th, td {{
border: 1px solid #ccc;
padding: 8px 12px;
text-align: left;
}}
th {{
background-color: #f5f5f5;
}}
strong {{
font-weight: bold;
}}
em {{
font-style: italic;
}}
ul, ol {{
padding-left: 2em;
}}
</style>
</head>
<body>
{html_body}
</body>
</html>"""


def convert_docx_to_html(docx_path: str, html_path: str) -> None:
    """Convert a single .docx file into a standalone HTML file.

    The document is converted with mammoth, any messages mammoth reports are
    printed, and the resulting markup is wrapped in a full HTML page whose
    title is the .docx file name without its extension.

    Args:
        docx_path: Path of the .docx file to read.
        html_path: Path of the HTML file to write (UTF-8, overwritten if it
            already exists).

    Raises:
        Nothing is caught here: errors from opening either file or from
        mammoth propagate to the caller.
    """
    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file, style_map=_STYLE_MAP)

    # Surface conversion messages (e.g. unrecognised styles) to the user.
    if result.messages:
        print(f" 转换警告 ({os.path.basename(docx_path)}):")
        for msg in result.messages:
            print(f" - {msg}")

    title = os.path.splitext(os.path.basename(docx_path))[0]
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(_build_html_document(title, result.value))
    print(f" [OK] 已生成: {os.path.basename(html_path)}")
def main():
    """Convert every .docx file found in WORK_DIR and print a summary.

    Each document is written next to the script as ``<name>.html``. A failure
    on one file is reported and counted but does not stop the remaining
    conversions. The process exits with status 1 if any conversion failed.
    """
    banner = "=" * 50
    print(banner)
    print("DOCX 转 HTML 转换工具")
    print(banner)

    sources = find_docx_files(WORK_DIR)
    if not sources:
        print("当前目录下未找到任何 .docx 文件")
        return
    print(f"发现 {len(sources)} 个 .docx 文件\n")

    converted = 0
    failed = 0
    for source_path in sources:
        source_name = os.path.basename(source_path)
        stem, _ext = os.path.splitext(source_name)
        target_path = os.path.join(WORK_DIR, stem + ".html")
        try:
            print(f"\n转换: {source_name}")
            convert_docx_to_html(source_path, target_path)
        except Exception as exc:
            # Top-level boundary: report the error and carry on with the
            # next document.
            print(f" [FAIL] 转换失败: {exc}")
            failed += 1
        else:
            converted += 1

    print("\n" + banner)
    print(f"转换完成: 成功 {converted} 个, 失败 {failed} 个")
    print(banner)

    # A non-zero exit status lets callers detect a partial failure.
    if failed:
        sys.exit(1)
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()