PDF信息VLM提取Excel工具

python 复制代码
# pdf_material_extractor.py
import os
import json
from pathlib import Path
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
import time
import logging
from dotenv import load_dotenv
import http.client
from urllib.parse import urlparse
import base64
from flask import Flask, request, jsonify, send_file
import threading
import webbrowser
import io
from tkinter import Tk
from tkinter.filedialog import askdirectory

# Configure logging at INFO level and create the module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PDFToImageConverter:
    """Render the pages of a PDF file into in-memory PIL images via PyMuPDF."""

    @staticmethod
    def convert_pdf_to_pil_images(pdf_path):
        """
        Convert a PDF file into a list of PIL images without writing to disk.

        Pages that fail to render are logged and skipped, so the result may
        contain fewer images than the document has pages.

        Args:
            pdf_path (str): Path of the PDF file.

        Returns:
            list: One PIL image per successfully rendered page, in page order;
                an empty list if the document cannot be opened or read.
        """
        logger.info(f"正在转换PDF为内存图像: {pdf_path}")

        try:
            pil_images = []
            # 2x zoom for better quality; identical for every page, so build it once
            zoom = fitz.Matrix(2.0, 2.0)

            # The context manager closes the document even when something below raises
            with fitz.open(pdf_path) as pdf_document:
                for page_num in range(len(pdf_document)):
                    try:
                        page = pdf_document[page_num]

                        # alpha=False keeps the sample buffer at 3 bytes per pixel,
                        # which is what Image.frombytes("RGB", ...) expects
                        pix = page.get_pixmap(matrix=zoom, alpha=False)
                        pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                        pil_images.append(pil_image)

                        logger.info(f"已转换PDF第 {page_num + 1} 页为内存图像")
                    except Exception as page_error:
                        # A broken page must not abort the whole document
                        logger.error(f"处理PDF第 {page_num + 1} 页时出错: {str(page_error)}")
                        continue

            return pil_images
        except Exception as e:
            logger.error(f"打开PDF文件 {pdf_path} 时出错: {str(e)}")
            return []

class VLMProcessor:
    """Client that sends drawing images to an OpenAI-style chat-completions VLM endpoint."""

    # Socket timeout in seconds, so a hung server cannot block a request forever
    REQUEST_TIMEOUT = 300

    def __init__(self, dotenv_path=r'E:\code\apikey\.env'):
        """
        Load the VLM API settings from a .env file.

        Args:
            dotenv_path (str): Path of the .env file that defines
                VLM_OPENAI_API_URL, VLM_MODEL_NAME and VLM_OPENAI_API_KEY.
                Defaults to the location the tool has always used.

        Raises:
            ValueError: If any of the three settings is missing or empty.
        """
        load_dotenv(dotenv_path)
        self.api_url = os.getenv('VLM_OPENAI_API_URL')
        self.model_name = os.getenv('VLM_MODEL_NAME')
        self.api_key = os.getenv('VLM_OPENAI_API_KEY')

        if not all([self.api_url, self.model_name, self.api_key]):
            raise ValueError("缺少VLM API配置,请检查.env文件")

    def encode_pil_image(self, pil_image):
        """
        Encode a PIL image as a base64 PNG string.

        Args:
            pil_image (PIL.Image): Image to encode.

        Returns:
            str: Base64 text of the PNG-serialised image.
        """
        # Serialise into an in-memory buffer rather than a temporary file
        buffer = io.BytesIO()
        pil_image.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def extract_material_info(self, pil_image):
        """
        Ask the VLM to extract part information from a drawing image.

        Args:
            pil_image (PIL.Image): Rendered drawing page.

        Returns:
            The JSON value decoded from the model's reply; the prompt asks for
            a list of dicts with the keys '零件', '材料', '数量' and '文件名'.

        Raises:
            Exception: Propagated from `_send_request` on transport, HTTP or
                parsing failures.
        """
        logger.info("正在提取材料信息")

        base64_image = self.encode_pil_image(pil_image)

        # One user message carrying the instruction text plus the image as a data URL
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "请从这张图纸中提取零件信息,包括零件名称、材料类型、数量以及每个零件的独立文件名。如果有多个零件,请分别列出。以JSON数组格式返回,例如:[{'零件': '零件1', '材料': 'Q235B', '数量': 1, '文件名': '文件1.dwg'}, {'零件': '零件2', '材料': 'Q345B', '数量': 2, '文件名': '文件2.dwg'}]。如果无法识别,请返回[{'零件': '未知', '材料': '未知', '数量': '未知', '文件名': '未知'}]"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]

        return self._send_request(messages)

    @staticmethod
    def _parse_json_content(content):
        """
        Decode the model's reply text as JSON, tolerating a Markdown code fence.

        Args:
            content (str): Raw reply text, optionally wrapped in ``` or ```json.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If the unwrapped text is not valid JSON.
        """
        text = content.strip()
        if text.startswith("```"):
            text = text[3:]
            # Drop the optional language tag that follows the opening fence
            if text[:4].lower() == "json":
                text = text[4:]
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        return json.loads(text)

    def _send_request(self, messages):
        """
        POST a chat-completions request and return the decoded JSON reply.

        Args:
            messages (list): Chat messages to send.

        Returns:
            The JSON value decoded from the first choice's message content.

        Raises:
            Exception: If the server answers with a non-200 status, or if the
                response cannot be parsed (the parsing error is logged and
                re-raised unchanged).
        """
        # rstrip avoids a double slash when the configured base URL ends with "/"
        endpoint = urlparse(f"{self.api_url.rstrip('/')}/chat/completions")

        # Honour the scheme and port of the configured URL; anything that is
        # not plain http keeps using HTTPS as before
        if endpoint.scheme == "http":
            connection_cls = http.client.HTTPConnection
        else:
            connection_cls = http.client.HTTPSConnection
        conn = connection_cls(endpoint.hostname, endpoint.port, timeout=self.REQUEST_TIMEOUT)

        request_body = {
            "model": self.model_name,
            "messages": messages,
            "temperature": 0.7
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Close the connection on every path, including transport errors
        try:
            conn.request(
                "POST",
                endpoint.path,
                body=json.dumps(request_body).encode('utf-8'),
                headers=headers
            )
            response = conn.getresponse()
            status = response.status
            response_data = response.read().decode('utf-8')
        finally:
            conn.close()

        if status != 200:
            raise Exception(f"VLM服务器错误: {status} - {response_data}")

        # Bound up front so the error handler can always log something useful,
        # even when the response has no 'choices' entry
        content = None
        try:
            data = json.loads(response_data)
            content = data['choices'][0]['message']['content']
            result = self._parse_json_content(content)
        except Exception as e:
            logger.error(f"解析VLM响应失败: {e}")
            logger.error(f"原始响应: {response_data if content is None else content}")
            raise

        logger.info(f"VLM处理结果: {result}")
        return result

class ExcelUpdater:
    """Maintain the Excel sheet that records the extracted parts of each PDF."""

    # Columns every sheet must contain; also the column order of a new sheet
    COLUMNS = ['PDF文件名', '零件', '零件文件名', '材料', '数量']

    @staticmethod
    def update_excel(excel_path, file_name, material_info_list):
        """
        Replace the rows of one PDF in the Excel file with fresh part data.

        Existing rows whose 'PDF文件名' equals `file_name` are dropped, then one
        row per entry of `material_info_list` is appended and the file is
        rewritten. The file is created when it does not exist yet.

        Args:
            excel_path (str): Path of the Excel file.
            file_name (str): PDF file name without extension.
            material_info_list (list): Part dicts with the optional keys
                '零件', '文件名', '材料' and '数量'; a single non-list value is
                treated as a one-element list. Entries that are not dicts are
                recorded with every field set to '未知'.

        Raises:
            Exception: Whatever `DataFrame.to_excel` raises when saving fails
                (logged, then re-raised).
        """
        logger.info(f"正在更新Excel: {excel_path}")
        columns = ExcelUpdater.COLUMNS

        if not isinstance(material_info_list, list):
            material_info_list = [material_info_list]

        if os.path.exists(excel_path):
            df = pd.read_excel(excel_path)
            logger.info(f"成功读取Excel文件,现有 {len(df)} 行数据")

            # Older sheets may lack some columns; add them empty
            for col in columns:
                if col not in df.columns:
                    df[col] = ""
        else:
            df = pd.DataFrame(columns=columns)
            logger.info("Excel文件不存在,创建新的DataFrame")

        if df.empty:
            # Nothing to keep: start from a clean frame with the expected columns
            df = pd.DataFrame(columns=columns)
        else:
            # Compare as text in case the column was read back as a non-string
            # type (e.g. a numeric-looking file name), so stale rows of this
            # PDF are still matched and removed
            df = df[df['PDF文件名'].astype(str) != str(file_name)]

        new_rows = []
        for material_info in material_info_list:
            if not isinstance(material_info, dict):
                # The VLM output is untrusted; do not crash on a malformed entry
                logger.warning(f"零件信息条目格式无效,按未知处理: {material_info!r}")
                material_info = {}

            new_rows.append({
                'PDF文件名': file_name,
                '零件': material_info.get('零件', '未知'),
                '零件文件名': material_info.get('文件名', '未知'),
                '材料': material_info.get('材料', '未知'),
                '数量': material_info.get('数量', '未知')
            })

        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

        logger.info(f"添加了 {len(new_rows)} 个零件信息")

        try:
            df.to_excel(excel_path, index=False)
            logger.info(f"已成功更新Excel文件,保存路径: {excel_path}")
        except Exception as e:
            logger.error(f"保存Excel文件失败: {str(e)}")
            raise
        logger.info("已更新Excel文件")

# Module-level state shared by the Flask route handlers for the current run
current_state = dict(
    pdf_files=[],
    current_index=0,
    current_pdf=None,
    current_image=None,
    current_pil_image=None,
    processing=False,
    converter=None,
    processor=None,
    updater=None,
    pdf_directory='',
    excel_path='',
    image_paths={},
    pil_images_cache={},
    processing_complete=False,
)

# Flask application object; the @app.route handlers below are registered on it
app = Flask(__name__)

@app.route('/')
def index():
    """
    Serve the single-page web UI.

    The page lets the user pick a PDF directory, starts the batch through
    /set_paths and /start_processing, then polls /process_next once per
    second, showing the preview image, progress and extracted parts.

    Returns:
        str: The complete HTML document (markup, styles and script).
    """
    return '''
    <!DOCTYPE html>
    <html>
    <head>
        <title>PDF材料信息提取工具</title>
        <meta charset="utf-8">
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
            .container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
            button { padding: 12px 24px; margin: 10px; font-size: 16px; border: none; border-radius: 4px; cursor: pointer; transition: background-color 0.3s; }
            .btn-primary { background-color: #17a2b8; color: white; }
            .btn-primary:hover { background-color: #138496; }
            .btn-start { background-color: #4CAF50; color: white; }
            .btn-start:hover { background-color: #45a049; }
            .btn-reset { background-color: #607D8B; color: white; }
            .btn-reset:hover { background-color: #455A64; }
            button:disabled { background-color: #cccccc; cursor: not-allowed; }
            #image-container { margin-top: 20px; text-align: center; min-height: 300px; display: flex; align-items: center; justify-content: center; }
            img { max-width: 100%; max-height: 70vh; border: 1px solid #ddd; border-radius: 4px; box-shadow: 0 2px 6px rgba(0,0,0,0.1); }
            .status { margin: 20px 0; padding: 15px; background-color: #e3f2fd; border-radius: 4px; }
            .controls { margin: 20px 0; text-align: center; }
            h1 { color: #333; text-align: center; }
            .progress-bar { height: 20px; background-color: #e0e0e0; border-radius: 10px; overflow: hidden; margin: 10px 0; }
            .progress-fill { height: 100%; background-color: #4CAF50; transition: width 0.3s; }
            .material-result {
                margin: 15px 0;
                padding: 10px;
                background-color: #e8f5e9;
                border: 1px solid #c8e6c9;
                border-radius: 4px;
                font-weight: bold;
                font-size: 16px;
            }
            .path-input {
                width: 70%;
                padding: 8px;
                margin-right: 10px;
                border: 1px solid #ccc;
                border-radius: 4px;
            }
            .browse-btn { padding: 8px 16px; background-color: #6c757d; color: white; border: none; border-radius: 4px; cursor: pointer; }
            .browse-btn:hover { background-color: #5a6268; }
            .processing-status {
                margin: 15px 0;
                padding: 10px;
                background-color: #e3f2fd;
                border-radius: 4px;
                text-align: center;
                font-weight: bold;
            }
            .part-item {
                margin: 8px 0;
                padding: 8px;
                background-color: #f1f8e9;
                border-left: 3px solid #4CAF50;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <h1>PDF材料信息提取工具</h1>

            <div class="status">
                <div style="margin-bottom: 15px;">
                    <label for="pdf-directory">PDF目录:</label>
                    <input type="text" id="pdf-directory" class="path-input" placeholder="请选择PDF目录">
                    <button class="browse-btn" onclick="browseDirectory()">浏览</button>
                </div>

                <button class="btn-primary" onclick="setPaths()" id="set-paths-btn">设置路径并开始处理</button>
                <button class="btn-reset" onclick="resetProcessing()" id="reset-btn">重置</button>

                <hr style="margin: 20px 0;">

                <p>当前文件: <strong><span id="current-file">无</span></strong></p>
                <p>进度: <span id="progress">0/0</span></p>
                <div class="progress-bar">
                    <div class="progress-fill" id="progress-bar-fill" style="width: 0%"></div>
                </div>
            </div>

            <div class="processing-status" id="processing-status" style="display: none;">
                <span id="processing-text">正在处理中...</span>
            </div>

            <div class="material-result" id="material-result" style="display: none;">
                <div>PDF文件名: <strong><span id="pdf-file-name"></span></strong></div>
                <div id="parts-container"></div>
            </div>

            <div id="image-container">
                <p id="no-image-msg">尚未开始处理</p>
                <img id="preview-image" src="" alt="PDF预览" style="display:none;">
            </div>
        </div>

        <script>
            // True while a batch is running. Cleared on completion, error or
            // reset so pending timers and late responses of that run are ignored.
            let processingActive = false;

            function browseDirectory() {
                fetch('/browse_directory', {method: 'POST'})
                    .then(response => response.json())
                    .then(data => {
                        if (data.directory) {
                            document.getElementById('pdf-directory').value = data.directory;
                        }
                    })
                    .catch(error => {
                        console.error('Error:', error);
                    });
            }

            function setPaths() {
                const pdfDirectory = document.getElementById('pdf-directory').value;

                if (!pdfDirectory) {
                    alert('请先选择PDF目录');
                    return;
                }

                fetch('/set_paths', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    },
                    body: JSON.stringify({
                        pdf_directory: pdfDirectory
                    })
                })
                .then(response => response.json())
                .then(data => {
                    if (data.error) {
                        alert('错误: ' + data.error);
                    } else {
                        startProcessing();
                    }
                })
                .catch(error => {
                    alert('设置路径时出错: ' + error.message);
                });
            }

            // End the current run. On success the completion text stays visible;
            // on error or abort the status banner is hidden.
            function finishProcessing(completed) {
                processingActive = false;
                const statusEl = document.getElementById('processing-status');
                document.getElementById('set-paths-btn').disabled = false;
                if (completed) {
                    document.getElementById('processing-text').textContent = '处理完成!';
                    statusEl.style.display = 'block';
                } else {
                    statusEl.style.display = 'none';
                }
            }

            // Shared handler for the /start_processing and /process_next replies
            function handleProcessingResponse(data) {
                if (!processingActive) {
                    return; // the run was reset while this request was in flight
                }
                if (data.error) {
                    alert('错误: ' + data.error);
                    finishProcessing(false);
                    return;
                }

                updateStatus(data);
                if (data.current_file) {
                    loadImage();
                } else {
                    finishProcessing(true);
                }
            }

            function handleProcessingFailure(error) {
                if (!processingActive) {
                    return;
                }
                alert('处理过程中出现错误: ' + error.message);
                finishProcessing(false);
            }

            function startProcessing() {
                processingActive = true;
                // Restore the in-progress text in case a previous run left the completion text
                document.getElementById('processing-text').textContent = '正在处理中...';
                document.getElementById('processing-status').style.display = 'block';
                document.getElementById('set-paths-btn').disabled = true;

                fetch('/start_processing', {method: 'POST'})
                    .then(response => response.json())
                    .then(handleProcessingResponse)
                    .catch(handleProcessingFailure);
            }

            function resetProcessing() {
                // Stop the polling chain before the server state is cleared
                processingActive = false;

                fetch('/reset_processing', {method: 'POST'})
                    .then(response => response.json())
                    .then(data => {
                        updateStatus(data);
                        hideImage();
                        hideMaterialInfo();
                        document.getElementById('processing-status').style.display = 'none';
                        document.getElementById('set-paths-btn').disabled = false;

                        document.getElementById('pdf-directory').value = '';
                    });
            }

            function loadImage() {
                const timestamp = new Date().getTime(); // cache buster
                const img = document.getElementById('preview-image');
                const msg = document.getElementById('no-image-msg');
                if (img && msg) {
                    img.src = '/image/' + timestamp;
                    img.style.display = 'inline';
                    msg.style.display = 'none';
                }

                // Ask for the next file after one second
                setTimeout(() => {
                    if (!processingActive) {
                        return;
                    }
                    fetch('/process_next', {method: 'POST'})
                        .then(response => response.json())
                        .then(handleProcessingResponse)
                        .catch(handleProcessingFailure);
                }, 1000);
            }

            function hideImage() {
                const img = document.getElementById('preview-image');
                const msg = document.getElementById('no-image-msg');
                if (img && msg) {
                    img.style.display = 'none';
                    msg.style.display = 'block';
                    // Only called on reset, so show the idle message again
                    msg.textContent = '尚未开始处理';
                }
            }

            function updateStatus(data) {
                const currentFileEl = document.getElementById('current-file');
                const progressEl = document.getElementById('progress');
                const progressBarFill = document.getElementById('progress-bar-fill');

                if (currentFileEl) {
                    currentFileEl.textContent = data.current_file || '无';
                }
                if (progressEl) {
                    progressEl.textContent = data.progress || '0/0';
                }

                if (progressBarFill && data.total > 0) {
                    const percentage = (data.processed / data.total) * 100;
                    progressBarFill.style.width = percentage + '%';
                } else if (progressBarFill) {
                    progressBarFill.style.width = '0%';
                }

                if (data.material_info) {
                    showMaterialInfo(data.material_info, data.file_name);
                } else {
                    hideMaterialInfo();
                }
            }

            function showMaterialInfo(materialInfo, fileName) {
                const resultEl = document.getElementById('material-result');
                const fileNameEl = document.getElementById('pdf-file-name');
                const partsContainer = document.getElementById('parts-container');

                if (resultEl && fileNameEl && partsContainer && (materialInfo || fileName)) {
                    if (fileName) {
                        fileNameEl.textContent = fileName;
                    }

                    partsContainer.innerHTML = '';

                    if (Array.isArray(materialInfo) && materialInfo.length > 0) {
                        // Several parts: one line per part
                        materialInfo.forEach((item, index) => {
                            const partDiv = document.createElement('div');
                            partDiv.className = 'part-item';

                            let partText = `零件${index+1}: `;
                            if (item.零件) partText += `${item.零件} `;
                            if (item.文件名) partText += `文件名: ${item.文件名} `;
                            if (item.材料) partText += `材料: ${item.材料} `;
                            if (item.数量 !== undefined) partText += `数量: ${item.数量}`;

                            partDiv.textContent = partText;
                            partsContainer.appendChild(partDiv);
                        });
                    } else if (materialInfo) {
                        // A single part object
                        const partDiv = document.createElement('div');
                        partDiv.className = 'part-item';

                        let partText = '零件1: ';
                        if (materialInfo.零件) partText += `${materialInfo.零件} `;
                        if (materialInfo.文件名) partText += `文件名: ${materialInfo.文件名} `;
                        if (materialInfo.材料) partText += `材料: ${materialInfo.材料} `;
                        if (materialInfo.数量 !== undefined) partText += `数量: ${materialInfo.数量}`;

                        partDiv.textContent = partText;
                        partsContainer.appendChild(partDiv);
                    }

                    resultEl.style.display = 'block';
                } else if (resultEl) {
                    resultEl.style.display = 'none';
                }
            }

            function hideMaterialInfo() {
                const resultEl = document.getElementById('material-result');
                if (resultEl) {
                    resultEl.style.display = 'none';
                }
            }
        </script>
    </body>
    </html>
    '''

# Preview image route; the timestamp path segment is unused and only defeats browser caching
@app.route('/image/<timestamp>')
def serve_image(timestamp):
    """Return the current preview image as a PNG response, or 404 when there is none."""
    logger.info(f"请求图片服务: current_pdf={current_state['current_pdf']}")

    preview = current_state['current_pil_image']
    if not (current_state['current_pdf'] and preview):
        logger.warning("没有当前图片可提供")
        return '', 404

    # Serialise the PIL image into memory and rewind so send_file reads from the start
    png_stream = io.BytesIO()
    preview.save(png_stream, 'PNG')
    png_stream.seek(0)
    return send_file(png_stream, mimetype='image/png')

@app.route('/browse_directory', methods=['POST'])
def browse_directory():
    """
    Open a native directory-selection dialog on the machine running the server.

    Returns:
        A JSON response: {'directory': <path>} with an empty string when the
        dialog is cancelled, or {'error': ...} when the dialog cannot be shown.
    """
    try:
        root = Tk()
        try:
            # Hide the empty Tk root window and keep the dialog above other windows
            root.withdraw()
            root.attributes('-topmost', True)

            directory = askdirectory(title="选择PDF文件所在目录")
        finally:
            # Always tear the Tk root down, even when the dialog raises
            root.destroy()

        # A cancelled dialog yields an empty value; normalise it to ''
        return jsonify({'directory': directory or ''})
    except Exception as e:
        logger.error(f"选择目录时出错: {str(e)}")
        return jsonify({'error': '选择目录时出错'})

# Route that records the PDF directory and derives the fixed Excel output path
@app.route('/set_paths', methods=['POST'])
def set_paths():
    """
    Store the PDF directory and the Excel output path in `current_state`.

    Expects a JSON object body with the key 'pdf_directory'. The Excel file
    always goes to excel_output/pdf_vlm_excel/pdf_material_info.xlsx relative
    to the working directory; the folder is created if needed.

    Returns:
        A JSON response with 'message', 'pdf_directory' and 'excel_path' on
        success, or {'error': ...} when the directory is missing or invalid.
    """
    # silent=True returns None instead of aborting on a missing or malformed
    # JSON body, so the client always receives a JSON error it can display
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    pdf_directory = data.get('pdf_directory')

    if not pdf_directory or not isinstance(pdf_directory, str):
        return jsonify({'error': '请提供PDF目录'})

    # isdir rather than exists: a regular file would otherwise pass this check
    # and only fail later when the directory is listed
    if not os.path.isdir(pdf_directory):
        return jsonify({'error': '指定的PDF目录不存在'})

    # Fixed Excel output location, created on demand
    excel_output_dir = os.path.join('excel_output', 'pdf_vlm_excel')
    os.makedirs(excel_output_dir, exist_ok=True)
    excel_path = os.path.join(excel_output_dir, 'pdf_material_info.xlsx')

    current_state['pdf_directory'] = pdf_directory
    current_state['excel_path'] = excel_path

    logger.info(f"设置路径完成 - PDF目录: {pdf_directory}")
    logger.info(f"Excel文件将保存至: {excel_path}")

    return jsonify({
        'message': '路径设置成功',
        'pdf_directory': pdf_directory,
        'excel_path': excel_path
    })

@app.route('/start_processing', methods=['POST'])
def start_processing():
    """
    Start a processing run over every PDF in the configured directory.

    Resets the per-run caches, creates the converter, VLM processor and Excel
    updater, collects the PDF file names and processes the first one.

    Returns:
        The JSON response of `process_next_file()` for the first file, or
        {'error': ...} when the paths are not set, the directory is missing or
        unreadable, the VLM configuration is invalid, or no PDF is found.
    """
    if not current_state['pdf_directory'] or not current_state['excel_path']:
        return jsonify({'error': '请先设置PDF目录和Excel文件路径'})

    pdf_directory = current_state['pdf_directory']
    logger.info(f"开始处理,PDF目录: {pdf_directory}")
    logger.info(f"Excel文件路径: {current_state['excel_path']}")

    # The directory may have disappeared since the paths were set
    if not os.path.isdir(pdf_directory):
        logger.error(f"PDF目录不存在: {pdf_directory}")
        return jsonify({'error': f'PDF目录不存在: {pdf_directory}'})

    # Drop whatever the previous run left behind
    current_state['pil_images_cache'] = {}
    current_state['image_paths'] = {}

    current_state['converter'] = PDFToImageConverter()
    try:
        current_state['processor'] = VLMProcessor()
    except Exception as e:
        # A missing API configuration must reach the page as JSON; an unhandled
        # exception would produce an HTML 500 the front end cannot parse
        logger.error(f"初始化VLM处理器失败: {str(e)}")
        return jsonify({'error': f'初始化VLM处理器失败: {str(e)}'})
    current_state['updater'] = ExcelUpdater()

    try:
        entries = os.listdir(pdf_directory)
    except OSError as e:
        logger.error(f"读取PDF目录失败: {str(e)}")
        return jsonify({'error': f'读取PDF目录失败: {str(e)}'})

    # Sorted so the processing order is deterministic across platforms
    current_state['pdf_files'] = sorted(f for f in entries if f.lower().endswith('.pdf'))
    current_state['current_index'] = 0

    logger.info(f"找到 {len(current_state['pdf_files'])} 个PDF文件")
    for i, pdf_file in enumerate(current_state['pdf_files']):
        logger.info(f"  {i+1}. {pdf_file}")

    if not current_state['pdf_files']:
        return jsonify({'error': '目录中没有找到PDF文件'})

    return process_next_file()

@app.route('/process_next', methods=['POST'])
def process_next():
    """Advance the file index by one and process the PDF at the new position."""
    next_index = current_state['current_index'] + 1
    current_state['current_index'] = next_index
    return process_next_file()

@app.route('/reset_processing', methods=['POST'])
def reset_processing():
    """Clear the per-run entries of `current_state` and report an empty progress."""
    logger.info("重置处理状态")

    # The configured paths and component instances are left untouched
    current_state.update({
        'pdf_files': [],
        'current_index': 0,
        'current_pdf': None,
        'current_image': None,
        'current_pil_image': None,
        'processing': False,
        'image_paths': {},
        'pil_images_cache': {},
        'processing_complete': False,
    })

    empty_progress = {
        'current_file': None,
        'progress': '0/0',
        'processed': 0,
        'total': 0
    }
    return jsonify(empty_progress)

def _extraction_failed_rows():
    """Return the placeholder part list recorded when VLM extraction fails."""
    return [{"零件": "未知", "材料": "提取失败", "数量": "提取失败", "文件名": "提取失败"}]


def _normalize_material_result(material_result):
    """Coerce a VLM result into a list of part dicts, or the failure placeholder."""
    if isinstance(material_result, dict):
        return [material_result]
    if isinstance(material_result, list):
        # The model output is untrusted: keep only well-formed entries
        parts = [item for item in material_result if isinstance(item, dict)]
        if parts or not material_result:
            return parts
    return _extraction_failed_rows()


def process_next_file():
    """
    Process the PDF at `current_state['current_index']`.

    Renders the PDF, uses its first page as the preview and as the VLM input,
    writes the extracted parts to the Excel file and reports the result. A
    file whose processing raises unexpectedly is skipped and the next one is
    tried.

    Returns:
        A JSON response with 'current_file', 'progress', 'processed', 'total',
        'material_info' and 'file_name' for the processed file, or a
        completion response with 'current_file' set to None once the index has
        passed the last file.
    """
    while current_state['current_index'] < len(current_state['pdf_files']):
        current_state['current_pdf'] = current_state['pdf_files'][current_state['current_index']]
        pdf_path = os.path.join(current_state['pdf_directory'], current_state['current_pdf'])

        logger.info(f"开始处理PDF文件 ({current_state['current_index']+1}/{len(current_state['pdf_files'])}): {pdf_path}")

        try:
            pdf_name = Path(current_state['current_pdf']).stem

            # Rendered pages are deliberately not cached: each PDF is visited
            # once per run, so a cache only accumulated every page of every
            # document in memory, and keying it by stem mixed up files that
            # share a stem (e.g. "a.pdf" and "a.PDF")
            pil_images = current_state['converter'].convert_pdf_to_pil_images(pdf_path)
            logger.info(f"新生成图像对象,共 {len(pil_images)} 页")

            material_info_list = None

            if pil_images:
                # The first page serves both as the preview and as the VLM input
                current_state['current_pil_image'] = pil_images[0]
                logger.info("设置当前PIL图像对象用于预览")

                try:
                    material_result = current_state['processor'].extract_material_info(current_state['current_pil_image'])
                    logger.info(f"材料信息提取结果: {material_result}")
                    material_info_list = _normalize_material_result(material_result)
                except Exception as e:
                    logger.error(f"材料信息提取失败: {str(e)}")
                    material_info_list = _extraction_failed_rows()

                # An Excel failure is logged but must not stop the batch
                try:
                    current_state['updater'].update_excel(
                        current_state['excel_path'],
                        pdf_name,
                        material_info_list
                    )
                    logger.info("Excel更新完成")
                except Exception as e:
                    logger.error(f"更新Excel时出错: {str(e)}")
            else:
                current_state['current_pil_image'] = None
                logger.warning(f"未生成图像对象 for {current_state['current_pdf']}")

            return jsonify({
                'current_file': current_state['current_pdf'],
                'progress': f'{current_state["current_index"] + 1}/{len(current_state["pdf_files"])}',
                'processed': current_state["current_index"] + 1,
                'total': len(current_state["pdf_files"]),
                'material_info': material_info_list,
                'file_name': pdf_name
            })
        except Exception as e:
            logger.error(f"处理文件 {current_state['current_pdf']} 时出错: {str(e)}")
            # Skip the failing file and try the next one
            current_state['current_index'] += 1

    # The index has passed the last file: the run is complete
    logger.info("所有文件处理完成")
    current_state['current_pdf'] = None
    current_state['current_image'] = None
    current_state['current_pil_image'] = None
    current_state['processing_complete'] = True
    return jsonify({
        'current_file': None,
        'progress': f'{len(current_state["pdf_files"])}/{len(current_state["pdf_files"])}',
        'processed': len(current_state["pdf_files"]),
        'total': len(current_state["pdf_files"]),
        'message': '所有文件处理完成'
    })

def run_web_interface():
    """Run the Flask server on 127.0.0.1:5000 with debug mode and the reloader off."""
    server_options = {
        'host': '127.0.0.1',
        'port': 5000,
        'debug': False,
        'use_reloader': False,
    }
    app.run(**server_options)

def open_browser():
    """Open the web UI in a new browser window."""
    ui_url = 'http://127.0.0.1:5000/'
    webbrowser.open_new(ui_url)

if __name__ == "__main__":
    print("启动PDF材料信息提取工具...")

    # Serve the web UI from a daemon thread so it ends together with the main thread
    web_thread = threading.Thread(target=run_web_interface, daemon=True)
    web_thread.start()

    # Give the server one second to come up before opening the browser
    time.sleep(1)
    open_browser()

    for banner_line in (
        "Web界面已在 http://127.0.0.1:5000 启动",
        "请在浏览器中查看界面",
        "按 Ctrl+C 退出程序",
    ):
        print(banner_line)

    # Keep the main thread alive until the user presses Ctrl+C
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\n程序已退出")
相关推荐
星空的资源小屋4 小时前
MkFont,一款开源免费的字体设计工具
网络·人工智能·pdf·电脑
开开心心就好10 小时前
Word转PDF工具,免费生成图片型文档
前端·网络·笔记·pdf·word·powerpoint·excel
寄思~13 小时前
python批量读取word表格写入excel固定位置
开发语言·python·excel
演绎平生1 天前
=word插入公式后行距变大怎么办?-笔记
pdf
2501_929157681 天前
Switch 20.5.0系统最新PSP模拟器懒人包
android·游戏·ios·pdf
zstar-_1 天前
FreeP2W:一个PDF转Word的CLI工具
pdf·word
njsgcs1 天前
读取文件夹内的pdf转换成png给vlm分类人工确认然后填入excel vlmapi速度挺快 qwen3-vl-plus webbrowser.open
分类·pdf·excel
星空的资源小屋1 天前
Tuesday JS,一款可视化小说编辑器
运维·网络·人工智能·编辑器·电脑·excel
sinat_375112261 天前
abap 通用发送邮件程序(获取alv数据,带excel附件)
excel·sap·abap·邮件