doc文档转换为html文档

最近需要将一批协议文档转换为HTML文档，并希望实现HTML表单填写后能够自动预览和导出PDF文档。但文档数量较多，手动编写代码不太现实。以下是两种思路：

思路一

将文档发送给AI大模型，让其识别并标注出表单，然后手动编写代码将表单转换为HTML。然而，文档中包含日期、括号、勾选框等特殊表单元素，转换代码需要逐个识别这些元素，且文档中存在大量空格（尤其是单空格），AI大模型在识别时经常出现错误或不准确的情况。

思路二

先将doc文档转换为txt文件，再利用AI大模型将txt文件直接转换为HTML。这种方式的优点是，在转换前可以复核文档内容，对错误部分进行修改后再进行转换，从而提高识别的准确性。

以下给出代码

doc to txt

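以下使用 python-docx 库的 Document 类读取文档，并逐段导出其中的文本。注意：python-docx 只能读取 .docx 格式，旧版 .doc 文件需要先另存为 .docx；示例代码把每个段落写成一个 `<p>` 标签，输出的是只包含段落文本的简单 HTML 文件。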

python 复制代码

import sys
from docx import Document
from html import escape

def convert_docx_to_html(docx_path, html_path):
    """Dump the paragraphs of a Word document into a minimal HTML page.

    The text of every entry in ``Document(docx_path).paragraphs`` is
    HTML-escaped and wrapped in its own ``<p>`` element. The page is
    written to ``html_path`` as UTF-8, replacing any existing file.

    Args:
        docx_path: Path of the Word document to read.
        html_path: Path of the HTML file to write.
    """
    document = Document(docx_path)

    page_open = '<html>\n<head>\n<meta charset="utf-8">\n<title>Document</title>\n</head>\n<body>\n'
    page_close = '</body>\n</html>'

    with open(html_path, 'w', encoding='utf-8') as out:
        out.write(page_open)
        # Escaping keeps document text from being interpreted as markup.
        out.writelines(
            f'<p>{escape(paragraph.text)}</p>\n'
            for paragraph in document.paragraphs
        )
        out.write(page_close)

def main():
    """Command-line entry point: ``ht.py <input_docx> <output_html>``.

    Prints a usage line and exits with status 1 unless exactly two
    arguments follow the script name.
    """
    try:
        # Unpacking fails unless argv holds the script name plus two paths.
        _script, input_docx, output_html = sys.argv
    except ValueError:
        print("Usage: python3 ht.py <input_docx> <output_html>")
        sys.exit(1)

    convert_docx_to_html(input_docx, output_html)
    print(f"Converted {input_docx} to {output_html}")


if __name__ == "__main__":
    main()

txt to html

以下使用的是华为云提供的 OpenAI 兼容接口，也可以使用 siliconflow 的 API。前两天调用 siliconflow 的 API 正常，今天总是超时，所以更换为华为云的接口。

用法：执行 `python txt_to_html.py D:\your\folder\path` 会递归处理指定文件夹及其子文件夹中的所有 txt 文件；不带参数执行 `python txt_to_html.py` 则处理当前目录。

注意：可以根据自己的需要修改提示词。

python 复制代码

import os
import json
import time
import re
from openai import OpenAI
import argparse

class SiliconFlowConverter:
    def __init__(self):
        """Store the endpoint settings and build the client used for requests."""
        # NOTE(review): the key below looks like a placeholder that must be
        # replaced with a real credential before running — confirm.
        api_key = "华为云ai密钥"
        base_url = "https://maas-cn-southwest-2.modelarts-maas.com/v1/infers/271c9332-4aa6-4ff5-95b3-0cf8bd94c394/v1"

        self.api_key = api_key
        self.model = "DeepSeek-V3"
        self.base_url = base_url
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        
    def clean_html_content(self, content):
        try:
            # 移除可能的markdown代码块标记
            content = re.sub(r'```html\s*', '', content)
            content = re.sub(r'```\s*$', '', content)
            
            # 查找第一个HTML标签的开始位置
            html_start = content.find('<html')
            if html_start == -1:
                html_start = content.find('<!DOCTYPE html>')
            
            if html_start == -1:
                print("警告：未找到HTML标签，尝试直接处理内容")
                # 尝试添加基本的HTML结构
                content = f'''<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>转换文档</title>
    <style>
        /* 表单样式优化 */
        input[type="text"],
        input[type="number"],
        input[type="date"],
        select,
        textarea {{
            width: 100%;
            padding: 8px 12px;
            margin: 4px 0;
            border: none;
            border-radius: 4px;
            background-color: #f5f5f5;
            font-size: 14px;
            transition: all 0.3s ease;
        }}
        
        input[type="text"]:focus,
        input[type="number"]:focus,
        input[type="date"]:focus,
        select:focus,
        textarea:focus {{
            outline: none;
            background-color: #ffffff;
            box-shadow: 0 0 0 2px rgba(0, 123, 255, 0.25);
        }}
        
        /* 文本输入框样式 */
        input[type="text"] {{
            min-width: 200px;
            max-width: 100%;
        }}
        
        /* 数字输入框样式 */
        input[type="number"] {{
            width: 120px;
        }}
        
        /* 日期选择器样式 */
        input[type="date"] {{
            width: 150px;
        }}
        
        /* 下拉选择框样式 */
        select {{
            min-width: 150px;
            max-width: 100%;
        }}
        
        /* 文本区域样式 */
        textarea {{
            min-height: 100px;
            resize: vertical;
        }}
        
        /* 复选框样式 */
        input[type="checkbox"] {{
            width: 16px;
            height: 16px;
            margin: 4px 8px;
        }}
        
        /* 单选框样式 */
        input[type="radio"] {{
            width: 16px;
            height: 16px;
            margin: 4px 8px;
        }}
        
        /* 表单标签样式 */
        label {{
            display: block;
            margin: 8px 0 4px;
            font-weight: 500;
            color: #333;
        }}
        
        /* 表单组样式 */
        .form-group {{
            margin-bottom: 16px;
        }}
        
        /* 表单行样式 */
        .form-row {{
            display: flex;
            align-items: center;
            margin-bottom: 12px;
            gap: 16px;
        }}
        
        /* 表单列样式 */
        .form-col {{
            flex: 1;
        }}
        
        /* 签名区域样式 */
        .signature-area {{
            margin-top: 32px;
            padding-top: 16px;
            border-top: 1px solid #eee;
        }}
        
        .signature-row {{
            display: flex;
            justify-content: space-between;
            margin-bottom: 16px;
        }}
        
        .signature-item {{
            flex: 1;
            margin: 0 8px;
        }}
        
        /* 按钮样式 */
        .button-group {{
            margin-top: 32px;
            padding: 16px;
            background-color: #f8f9fa;
            border-radius: 4px;
            display: flex;
            justify-content: center;
            gap: 16px;
        }}
        
        .button {{
            padding: 8px 24px;
            border: none;
            border-radius: 4px;
            background: linear-gradient(135deg, #007bff, #0056b3);
            color: white;
            font-size: 14px;
            cursor: pointer;
            transition: all 0.3s ease;
        }}
        
        .button:hover {{
            transform: translateY(-1px);
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        }}
        
        /* 响应式布局 */
        @media (max-width: 768px) {{
            .form-row {{
                flex-direction: column;
            }}
            
            .signature-row {{
                flex-direction: column;
            }}
            
            .signature-item {{
                margin: 8px 0;
            }}
        }}
    </style>
</head>
<body>
{content}
</body>
</html>'''
                return content.strip()
                
            # 从HTML标签开始截取内容
            content = content[html_start:]
            
            # 移除开头的说明文字（如果有）
            content = re.sub(r'^.*?<!DOCTYPE html>', '<!DOCTYPE html>', content, flags=re.DOTALL)
            
            # 移除结尾的说明文字（如果有）
            content = re.sub(r'</html>.*?$', '</html>', content, flags=re.DOTALL)
            
            # 移除其他可能的说明文字
            content = re.sub(r'```note.*?```', '', content, flags=re.DOTALL)
            
            # 确保内容以</html>结尾
            if not content.strip().endswith('</html>'):
                content = content.strip() + '\n</html>'
            
            # 确保有基本的HTML结构和样式
            if '<head>' not in content:
                head_content = '''
<head>
    <meta charset="utf-8">
    <title>转换文档</title>
    <style>
        /* 表单样式优化 */
        input[type="text"],
        input[type="number"],
        input[type="date"],
        select,
        textarea {{
            width: 100%;
            padding: 8px 12px;
            margin: 4px 0;
            border: none;
            border-radius: 4px;
            background-color: #f5f5f5;
            font-size: 14px;
            transition: all 0.3s ease;
        }}
        
        input[type="text"]:focus,
        input[type="number"]:focus,
        input[type="date"]:focus,
        select:focus,
        textarea:focus {{
            outline: none;
            background-color: #ffffff;
            box-shadow: 0 0 0 2px rgba(0, 123, 255, 0.25);
        }}
        
        /* 文本输入框样式 */
        input[type="text"] {{
            min-width: 200px;
            max-width: 100%;
        }}
        
        /* 数字输入框样式 */
        input[type="number"] {{
            width: 120px;
        }}
        
        /* 日期选择器样式 */
        input[type="date"] {{
            width: 150px;
        }}
        
        /* 下拉选择框样式 */
        select {{
            min-width: 150px;
            max-width: 100%;
        }}
        
        /* 文本区域样式 */
        textarea {{
            min-height: 100px;
            resize: vertical;
        }}
        
        /* 复选框样式 */
        input[type="checkbox"] {{
            width: 16px;
            height: 16px;
            margin: 4px 8px;
        }}
        
        /* 单选框样式 */
        input[type="radio"] {{
            width: 16px;
            height: 16px;
            margin: 4px 8px;
        }}
        
        /* 表单标签样式 */
        label {{
            display: block;
            margin: 8px 0 4px;
            font-weight: 500;
            color: #333;
        }}
        
        /* 表单组样式 */
        .form-group {{
            margin-bottom: 16px;
        }}
        
        /* 表单行样式 */
        .form-row {{
            display: flex;
            align-items: center;
            margin-bottom: 12px;
            gap: 16px;
        }}
        
        /* 表单列样式 */
        .form-col {{
            flex: 1;
        }}
        
        /* 签名区域样式 */
        .signature-area {{
            margin-top: 32px;
            padding-top: 16px;
            border-top: 1px solid #eee;
        }}
        
        .signature-row {{
            display: flex;
            justify-content: space-between;
            margin-bottom: 16px;
        }}
        
        .signature-item {{
            flex: 1;
            margin: 0 8px;
        }}
        
        /* 按钮样式 */
        .button-group {{
            margin-top: 32px;
            padding: 16px;
            background-color: #f8f9fa;
            border-radius: 4px;
            display: flex;
            justify-content: center;
            gap: 16px;
        }}
        
        .button {{
            padding: 8px 24px;
            border: none;
            border-radius: 4px;
            background: linear-gradient(135deg, #007bff, #0056b3);
            color: white;
            font-size: 14px;
            cursor: pointer;
            transition: all 0.3s ease;
        }}
        
        .button:hover {{
            transform: translateY(-1px);
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        }}
        
        /* 响应式布局 */
        @media (max-width: 768px) {{
            .form-row {{
                flex-direction: column;
            }}
            
            .signature-row {{
                flex-direction: column;
            }}
            
            .signature-item {{
                margin: 8px 0;
            }}
        }}
    </style>
</head>'''
                content = content.replace('<html>', f'<html>\n{head_content}')
            
            if '<body>' not in content:
                content = content.replace('</head>', '</head>\n<body>')
                content = content.replace('</html>', '</body>\n</html>')
                
            # 移除多余的空行，但保留必要的换行
            content = re.sub(r'\n\s*\n', '\n', content)
            
            # 确保所有标签都正确闭合
            content = re.sub(r'<br>', '<br/>', content)
            content = re.sub(r'<img([^>]*)>', r'<img\1/>', content)
            
            return content.strip()
        except Exception as e:
            print(f"清理HTML内容时出错: {str(e)}")
            return content.strip()
        
    def read_file_with_encoding(self, file_path):
        """Read a text file, trying several encodings until one decodes.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The decoded file content, without a leading UTF-8 byte-order mark.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: The file is empty or whitespace-only, is larger
                than 10 MB, or cannot be decoded with any candidate encoding.
            Exception: Any other error raised while opening or reading the
                file, wrapped with the original as its cause.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")

        file_size = os.path.getsize(file_path)
        if file_size == 0:
            raise ValueError(f"文件为空: {file_path}")
        if file_size > 10 * 1024 * 1024:  # 10 MB limit
            raise ValueError(f"文件过大（超过10MB）: {file_path}")

        # 'utf-8-sig' reads plain UTF-8 as well, but also strips the BOM that
        # some Windows editors prepend; plain 'utf-8' would leave '\ufeff'
        # at the start of the text.
        encodings = ['utf-8-sig', 'gbk', 'gb2312', 'gb18030', 'big5']
        last_error = None

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeDecodeError as e:
                # Wrong guess: remember the error and try the next encoding.
                last_error = e
                continue
            except Exception as e:
                raise Exception(f"读取文件时出错: {str(e)}") from e

            # Checked outside the try so this ValueError reaches the caller
            # as-is, matching the size checks above.
            if not content.strip():
                raise ValueError(f"文件内容为空: {file_path}")
            return content

        raise ValueError(f"无法使用支持的编码格式读取文件: {file_path}，最后错误: {str(last_error)}")
        
    def convert_txt_to_html(self, txt_file_path):
        try:
            # 读取txt文件内容
            txt_content = self.read_file_with_encoding(txt_file_path)
            
            # 构建提示词
            prompt = f"""请将以下文本转换为HTML格式，要求：
1. 内容准确性要求：
   - 严格按照原文内容生成，不得添加、删除或修改任何内容
   - 保持原文的所有段落、标点符号和格式
   - 保持原文的所有示例内容
   - 保持原文的所有字段名称和描述
   - 保持原文的所有数值和单位
   - 保持原文的所有日期格式
   - 保持原文的所有签名项和顺序
   - 保持原文的所有表格结构和内容
   - 保持原文的所有特殊字符和空格

2. 文档格式要求：
   - 标题居中显示，使用较大字号
   - 标题与正文之间保持适当间距
   - 正文段落首行缩进2个字符
   - 段落之间保持统一的行间距
   - 重要内容使用加粗或斜体突出显示
   - 列表项使用统一的缩进和样式
   - 表格标题居中显示
   - 表格内容对齐方式统一
   - 表格边框和间距统一
   - 页眉页脚居中对齐
   - 文档整体采用对称布局

3. 表单生成要求：
   - 为每个关键信息创建对应的表单字段
   - 企业名称使用文本输入框
   - 日期使用日期选择器
   - 所有数字和金额类型使用文本输入框
   - 选项使用下拉选择框或单选框
   - 多选项使用复选框
   - 长文本使用文本区域
   - 保持原文中的示例值作为默认值
   - 保持原文中的字段顺序

4. 表单样式要求：
   - 移除所有表单元素的默认边框
   - 使用CSS美化表单元素
   - 保持表单元素的间距和对齐
   - 统一表单元素的大小和样式
   - 文本输入框宽度根据内容自适应
   - 日期选择器和下拉框宽度统一
   - 表单元素添加圆角和阴影效果
   - 输入框获得焦点时有优雅的过渡效果
   - 表单元素使用柔和的颜色方案

5. 页面布局要求：
   - 严格按照原文的段落格式生成
   - 使用合适的字体和字号
   - 保持适当的行间距和段落间距
   - 确保段落对齐和缩进统一
   - 避免不必要的换行
   - 保持文本的连续性
   - 页面整体采用居中布局
   - 内容区域添加适当的内边距和外边距
   - 使用网格系统确保布局整齐

6. 内容完整性要求：
   - 确保所有原文内容都被完整转换
   - 不要遗漏任何段落或内容
   - 保持原文的层次结构
   - 确保所有表单字段都有对应的内容
   - 不要添加原文中没有的内容
   - 保持原文的所有表格和列表结构
   - 保持原文的所有标题和子标题
   - 保持原文的所有页眉和页脚内容

7. 签名部分要求：
   - 签名部分采用对称布局
   - 签名行使用表格布局确保对齐
   - 签名项之间保持等距
   - 签名项宽度统一
   - 签名项标签对齐
   - 签名项输入框大小一致
   - 签名日期格式统一
   - 签名部分整体居中显示
   - 签名区域添加优雅的分隔线
   - 保持原文中的签名项顺序和内容

8. 功能按钮要求：
   - 页面底部只保留两个按钮：
     * 预览按钮：用于预览表单内容
     * 导出PDF按钮：用于导出PDF文件
   - 不要添加其他任何功能按钮
   - 按钮样式简洁美观
   - 按钮位置固定在页面底部
   - 按钮添加悬停效果和过渡动画
   - 按钮使用渐变色背景
   - 按钮添加阴影效果
   - 按钮区域添加适当的内边距
   - 按钮之间保持合适的间距
   - 按钮区域添加背景色以区分内容

9. 页面美化要求：
   - 使用现代化的配色方案
   - 添加适当的背景色和渐变效果
   - 使用优雅的字体组合
   - 添加细腻的阴影效果
   - 使用圆角设计
   - 添加平滑的过渡动画
   - 确保页面整体视觉协调
   - 优化移动端显示效果

10. 其他要求：
    - 确保所有样式都正确应用，不要在页面上显示CSS代码
    - 使用响应式设计，确保在不同设备上都能正常显示
    - 确保所有表单元素都有合适的label标签
    - 保持页面整体布局的一致性
    - 添加适当的表单提示和帮助信息
    - 严格按照原文内容生成，不要添加任何额外说明或注释

文本内容：
{txt_content}"""

            max_retries = 3
            retry_count = 0
            
            while retry_count < max_retries:
                try:
                    print(f"正在发送API请求... (尝试 {retry_count + 1}/{max_retries})")
                    
                    # 添加请求超时设置
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": "你是一个专业的HTML转换助手，请严格按照要求将文本转换为HTML格式。请确保生成的HTML代码完整且有效。"},
                            {"role": "user", "content": prompt}
                        ],
                        max_tokens=4096,
                        temperature=0.7,
                        stream=False,
                        timeout=300  # 设置5分钟超时
                    )
                    
                    if not response or not response.choices:
                        raise Exception("API返回结果为空")
                        
                    html_content = response.choices[0].message.content
                    
                    if not html_content:
                        raise Exception("生成的HTML内容为空")
                    
                    # 清理HTML内容
                    html_content = self.clean_html_content(html_content)
                    
                    # 验证HTML内容是否有效
                    if not html_content.strip().startswith('<!DOCTYPE html>') and not html_content.strip().startswith('<html'):
                        raise Exception("生成的HTML内容格式不正确")
                    
                    # 生成输出HTML文件路径
                    output_path = os.path.splitext(txt_file_path)[0] + '.html'
                    
                    # 保存HTML文件
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(html_content)
                        
                    print(f"转换成功！HTML文件已保存至: {output_path}")
                    return output_path
                    
                except Exception as e:
                    print(f"请求发生错误: {str(e)}")
                    if retry_count < max_retries - 1:
                        wait_time = 30 * (retry_count + 1)  # 递增等待时间
                        print(f"等待{wait_time}秒后重试...")
                        time.sleep(wait_time)
                        retry_count += 1
                        continue
                    else:
                        print("达到最大重试次数，转换失败")
                        return None
            
            print("达到最大重试次数，转换失败")
            return None
                
        except Exception as e:
            print(f"处理文件 {txt_file_path} 时发生错误: {str(e)}")
            return None

def main():
    """Command-line entry point: convert every .txt file under a folder.

    Takes one optional positional argument, the folder to process
    (defaults to the current directory). The folder is walked recursively
    and each file whose name ends with ``.txt`` is passed to
    ``SiliconFlowConverter.convert_txt_to_html``. Progress and results are
    printed; nothing is returned.
    """
    parser = argparse.ArgumentParser(description='将TXT文件转换为HTML格式')
    parser.add_argument('path', nargs='?', default='.', help='要处理的文件夹路径，默认为当前目录')
    target_path = os.path.abspath(parser.parse_args().path)

    # Guard clauses: the target must exist and be a directory.
    if not os.path.exists(target_path):
        print(f"错误：路径 '{target_path}' 不存在！")
        return
    if not os.path.isdir(target_path):
        print(f"错误：'{target_path}' 不是一个目录！")
        return

    # Collect the .txt files from the whole tree.
    txt_files = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(target_path)
        for name in names
        if name.endswith('.txt')
    ]
    if not txt_files:
        print(f"在 '{target_path}' 及其子文件夹中没有找到txt文件！")
        return

    total = len(txt_files)
    print(f"找到 {total} 个txt文件，开始处理...")
    converter = SiliconFlowConverter()

    for index, txt_file in enumerate(txt_files, 1):
        print(f"\n正在处理第 {index}/{total} 个文件: {txt_file}")
        try:
            output_path = converter.convert_txt_to_html(txt_file)
            print(f"✓ 转换成功：{output_path}" if output_path else f"✗ 转换失败：{txt_file}")
        except Exception as e:
            # One failing file must not stop the rest of the batch.
            print(f"✗ 处理文件时出错：{str(e)}")

    print("\n所有文件处理完成！")


if __name__ == "__main__":
    main()

doc文档转换为html文档

思路一

思路二

doc to txt

txt to html

注意：可以根据自己的需要修改提示词。

注意：可以根据自己的需要修改提示词。