文档解析辅助工具编码实现手册
摘要
本手册提供了一套完整的Python实现方案,用于将自然语言设计文档转换为结构化数据。工具采用模块化设计,包含以下核心组件:
- 核心模块:
- 输入处理:支持TXT/DOCX/MD格式解析,自动过滤无效内容
- 解析引擎:实现拼块拆分、层级标注、九宫格绑定、语义深化和符号映射
- 校验模块:严格遵循原文档规则进行结果校验
- 输出处理:支持JSON/Excel/CSV/MySQL多格式导出
- 技术实现:
- 基于Python 3.9+开发
- 使用python-docx/markdown处理文档输入
- PyQt5实现GUI界面
- 单元测试覆盖所有核心功能
- 支持一键打包为可执行文件
- 关键特性:
- 严格对齐原文档规则体系
- 分层解耦的模块化设计
- 可视化操作界面
- 完整的日志记录
- 1-2周可完成开发部署
手册提供了每个模块的完整伪代码和实现代码,包括:
- 配置文件管理
- 文档解析流水线
- 校验规则实现
- 多格式导出处理
- GUI界面开发
- 单元测试用例
开发人员可基于本手册快速实现文档解析工具,解决设计文档到程序可识别数据的转换问题。
Q&A
Q8
请帮我生成该工具的详细编码实现手册,包括每个子模块的完整伪代码、核心函数代码、GUI 开发代码、单元测试代码,直接供开发人员编码实现
文档解析辅助工具 详细编码实现手册
工具定位:程序读档专属翻译器,实现自然语言设计文档→程序可识别的结构化数据转换;开发语言:Python 3.9+;核心技术栈:python-docx、python-markdown、pymysql、openpyxl、PyQt5、logging;开发目标:1-2 周内完成轻量可运行工具,支持文档解析、规则校验、多格式导出。
目录
- 环境搭建与项目结构
- 各子模块完整实现代码(伪代码 + 核心函数)
- GUI 开发完整代码
- 单元测试完整代码
- 打包与部署说明
1. 环境搭建与项目结构
1.1 环境搭建(一键安装依赖)
bash
bash
# 新建虚拟环境(可选)
python -m venv doc_parser_env
# 激活环境(Windows)
doc_parser_env\Scripts\activate
# 激活环境(Mac/Linux)
source doc_parser_env/bin/activate
# 安装核心依赖
pip install python-docx markdown pymysql openpyxl pyqt5 matplotlib numpy
1.2 项目结构(规范易维护)
plaintext
css
doc_parser_tool/
├── core/ # 核心解析模块
│ ├── __init__.py
│ ├── input_handler.py # 输入层:文档解析+归一化
│ ├── parser_core.py # 核心解析层:拆分+标注+绑定+映射
│ ├── output_handler.py # 输出层:结构化+导出
│ └── validator.py # 校验模块
├── gui/ # GUI模块
│ ├── __init__.py
│ └── main_window.py # 主界面
├── config/ # 配置文件
│ ├── __init__.py
│ ├── symbol_mapping.py # 文字-符号映射配置
│ └── db_config.py # 数据库配置
├── tests/ # 单元测试
│ ├── __init__.py
│ ├── test_input.py
│ ├── test_parser.py
│ └── test_output.py
├── logs/ # 日志目录
├── output/ # 输出文件目录
├── main.py # 工具入口
└── README.md # 使用手册
2. 各子模块完整实现代码
2.1 配置文件模块
2.1.1 符号映射配置(config/symbol_mapping.py)
python
python
# 文字→符号枚举定义
from enum import Enum
class SymbolEnum(Enum):
    """Symbol enumeration for core terms, rules and operations.

    Each member name is the symbol; each value is the Chinese term from the
    source design document that the symbol stands for (one-to-one).
    """
    # Three-body system
    SYMBOL_BODY_LAW = "法身"
    SYMBOL_BODY_KARMA = "报身"
    SYMBOL_BODY_RESPONSE = "应身"
    # The three nine-cell grids
    SYMBOL_GRID_CENTER = "中-地位表"
    SYMBOL_GRID_OUTER = "外-角色表"
    SYMBOL_GRID_INNER = "内-身份表"
    # Core operations
    SYMBOL_OP_REG = "正则式求是"
    SYMBOL_OP_POLY = "多项式求解"
    SYMBOL_OP_DET = "行列式求值"
    # Level names of the "8 three-part systems"
    SYMBOL_83_GANG = "总纲"
    SYMBOL_83_MU = "总目"
    SYMBOL_83_KE = "总科"
    SYMBOL_83_SHU = "总属"
    # Taiyi / Dao / Ming
    SYMBOL_TAIYI = "太乙"
    SYMBOL_DAO = "道"
    SYMBOL_MING = "名"
# Text -> symbol lookup (reverse query): maps each enum value (the Chinese
# term) to the name of the SymbolEnum member that carries it.
TEXT_TO_SYMBOL = {v.value: v.name for v in SymbolEnum}
# Symbol -> base number mapping (per the source document's numeral rules).
# Values are not unique across groups (e.g. SYMBOL_BODY_LAW and
# SYMBOL_83_GANG are both 10), so this table cannot be inverted.
SYMBOL_TO_BASE_NUM = {
    "SYMBOL_BODY_LAW": 10,
    "SYMBOL_BODY_KARMA": 20,
    "SYMBOL_BODY_RESPONSE": 30,
    "SYMBOL_GRID_CENTER": 1,
    "SYMBOL_GRID_OUTER": 2,
    "SYMBOL_GRID_INNER": 3,
    "SYMBOL_OP_REG": 100,
    "SYMBOL_OP_POLY": 200,
    "SYMBOL_OP_DET": 300,
    "SYMBOL_83_GANG": 10,
    "SYMBOL_83_MU": 20,
    "SYMBOL_83_KE": 30,
    "SYMBOL_83_SHU": 40,
    "SYMBOL_TAIYI": 0,
    "SYMBOL_DAO": 1,
    "SYMBOL_MING": 2
}
# Nine-cell grid rules:
#   col_rule  - grid column letter -> piece type expected in that column
#   row_rule  - grid row number (as a string) -> levels allowed in that row
#   grid_type - per-grid attributes (numeral system, quantisation dimension,
#               font rule, call rule) keyed by grid type 中 / 外 / 内
GRID_RULES = {
    "col_rule": {"a": "角", "b": "边", "c": "料"},
    "row_rule": {"1": ["总纲", "总目"], "2": ["总科"], "3": ["总属"]},
    "grid_type": {
        "中": {"num_system": 2, "quant_dim": "色度", "font": "宋体", "call_type": "延时by-times"},
        "外": {"num_system": 10, "quant_dim": "亮度", "font": "斜体", "call_type": "动态through-time"},
        "内": {"num_system": 3, "quant_dim": "明度", "font": "加粗", "call_type": "即时at-a-time"}
    }
}
2.1.2 数据库配置(config/db_config.py)
python
python
# Default database settings (the GUI may overwrite them at runtime).
# NOTE(review): the password is a hard-coded placeholder; replace it before
# any real deployment and avoid committing real credentials.
DB_CONFIG = {
    "host": "localhost",
    "user": "root",
    "password": "123456",
    "db": "doc_parser_db",
    "charset": "utf8mb4"
}
# CREATE TABLE statements for the core tables, keyed by table name
# (the nine-cell grid table and the text-symbol mapping table).
CREATE_TABLE_SQLS = {
    "core_three_grid": """
CREATE TABLE IF NOT EXISTS core_three_grid (
grid_id VARCHAR(32) PRIMARY KEY COMMENT '九宫格唯一ID',
grid_name VARCHAR(32) NOT NULL COMMENT '九宫格名称',
piece_id VARCHAR(32) NOT NULL COMMENT '拼块ID',
piece_content TEXT NOT NULL COMMENT '拼块内容',
symbol_name VARCHAR(64) COMMENT '符号名称',
number_value INT COMMENT '数字值',
num_system INT COMMENT '数制',
quant_dim VARCHAR(16) COMMENT '量化维度',
font VARCHAR(16) COMMENT '字体规则',
call_type VARCHAR(32) COMMENT '调用规则',
create_time DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='三套九宫格核心表';
""",
    "text_symbol_mapping": """
CREATE TABLE IF NOT EXISTS text_symbol_mapping (
id INT AUTO_INCREMENT PRIMARY KEY,
text_content VARCHAR(128) NOT NULL COMMENT '文字内容',
symbol_name VARCHAR(64) NOT NULL COMMENT '符号名称',
base_num INT COMMENT '基础数字',
create_time DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
UNIQUE KEY uk_text (text_content)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='文字-符号映射表';
"""
}
2.2 输入层模块(core/input_handler.py)
python
python
"""输入层:多格式文档解析+归一化+无效内容过滤"""
import re
import os
from docx import Document
import markdown
class InputHandler:
    """Input layer: parse TXT/DOCX/MD documents and normalise their text.

    ``parse_document`` returns the raw text of a file; ``normalize_content``
    turns that text into a list of non-empty, content-bearing lines.
    """

    def __init__(self):
        # Regex fragments describing lines that carry no design content.
        self.invalid_patterns = [
            r"页眉|页脚",  # header / footer markers
            r"^\s*$",  # blank lines
            r"注释:|备注:",  # comment / remark lines
            r"作者:|日期:",  # author / date lines
        ]
        self.invalid_re = re.compile("|".join(self.invalid_patterns))

    def parse_document(self, file_path):
        """Extract the plain text of a document.

        :param file_path: path to a ``.txt``, ``.docx`` or ``.md`` file
        :return: raw text content
        :raises RuntimeError: if the format is unsupported or parsing fails;
            the original exception is attached as ``__cause__``
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        parsers = {
            ".txt": self._parse_txt,
            ".docx": self._parse_docx,
            ".md": self._parse_md,
        }
        try:
            parser = parsers.get(file_ext)
            if parser is None:
                raise ValueError(f"不支持的文件格式:{file_ext}")
            return parser(file_path)
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"解析文档失败:{str(e)}") from e

    def normalize_content(self, raw_content):
        """Drop invalid lines and return one logical unit per list entry.

        :param raw_content: raw text as returned by ``parse_document``
        :return: list of stripped lines that are non-empty and match none of
            the invalid-content patterns
        """
        normalized_lines = []
        # splitlines() copes with \n, \r\n and lone \r line endings alike.
        for line in raw_content.splitlines():
            line = line.strip()
            if line and not self.invalid_re.search(line):
                normalized_lines.append(line)
        return normalized_lines

    def _parse_txt(self, file_path):
        """Read a TXT document as UTF-8 text."""
        # "utf-8-sig" reads plain UTF-8 too, and strips a BOM if present so it
        # does not end up glued to the first line.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()

    def _parse_docx(self, file_path):
        """Read a DOCX document: body paragraphs joined with newlines."""
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    def _parse_md(self, file_path):
        """Read a Markdown document and strip the markup, keeping plain text."""
        import html  # local import: only this method needs it

        with open(file_path, "r", encoding="utf-8-sig") as f:
            md_content = f.read()
        # Render to HTML, then remove the tags.
        html_content = markdown.markdown(md_content)
        pure_text = re.sub(r"<.*?>", "", html_content)
        # The renderer escapes &, < and > in text; turn the entities back into
        # the characters the author wrote.
        return html.unescape(pure_text)
# Smoke test: run this module directly to exercise the TXT pipeline.
if __name__ == "__main__":
    handler = InputHandler()
    test_txt = "test.txt"
    try:
        with open(test_txt, "w", encoding="utf-8") as f:
            f.write("总纲:三套九宫格\n页脚:2026\n法身:本然清洁\n")
        content = handler.parse_document(test_txt)
        normalized = handler.normalize_content(content)
        print("归一化结果:", normalized)
    finally:
        # Remove the scratch file even when parsing raises.
        if os.path.exists(test_txt):
            os.remove(test_txt)
2.3 核心解析层模块(core/parser_core.py)
python
python
"""核心解析层:拼块拆分+层级标注+九宫格绑定+语义深化+映射"""
import uuid
import numpy as np
from config.symbol_mapping import TEXT_TO_SYMBOL, SYMBOL_TO_BASE_NUM, GRID_RULES
class ParserCore:
    """Core parsing layer.

    Pipeline (each stage annotates the piece dicts in place and returns them):
    ``split_pieces`` -> ``annotate_level`` -> ``bind_grid`` ->
    ``deep_semantic`` -> ``map_text_symbol_number``.
    """

    # level -> (position, is_hide, is_suspend)
    _LEVEL_ATTRS = {
        "总纲": ("顶", False, False),
        "总目": ("维度abc", False, True),
        "总科": ("层次123", True, False),
        "总属": ("底", True, False),
    }
    # level -> (grid type, grid row); unknown levels fall back to ("中", "1")
    _LEVEL_GRID = {
        "总纲": ("中", "1"),
        "总目": ("中", "1"),
        "总科": ("内", "2"),
        "总属": ("外", "3"),
    }
    # piece type -> grid column; unknown types fall back to "c"
    _TYPE_COLUMN = {"角": "a", "边": "b", "料": "c"}
    # Ordered (keywords, result) rules; the first rule with a keyword
    # contained in the content wins.
    _WORD_RULES = (
        (("本然", "本质"), "本"),
        (("表面", "迹"), "迹"),
    )
    _CLASSIFY_RULES = (
        (("生物", "K常量"), ("狭义", "K常量", "中文", "正则式")),
        (("系统", "V变量"), ("广义", "V变量", "英文", "多项式")),
        (("功能", "不定量λ"), ("绝对", "不定量λ", "聚合式K-V", "行列式")),
    )
    _UNIVERSE_RULES = (
        (("文字", "概念"), ("语言文字因子宇宙", "概念图")),
        (("符号", "逻辑"), ("逻辑符号元素世界", "逻辑图")),
        (("数字", "数学"), ("数学数字基因宇宙", "存在图")),
    )

    def __init__(self):
        # Keywords identifying the piece type (料 / 边 / 角).
        self.piece_type_keywords = {
            "料": ["总纲", "内容表", "现象", "认知能熵", "水", "文字般若"],
            "边": ["总目", "格式表", "迹象", "认知残余", "氧气", "观照般若"],
            "角": ["总科", "样式表", "角色", "认知内核", "血液", "实相般若"],
        }
        # Keywords identifying the level (总纲 / 总目 / 总科 / 总属).
        self.level_keywords = {
            "总纲": ["顶", "大纲", "方面"],
            "总目": ["维度", "abc", "目录", "分类"],
            "总科": ["层次", "123", "段落", "分词"],
            "总属": ["底", "标题", "三位一体"],
        }

    def split_pieces(self, normalized_lines):
        """Split lines into the smallest text units ("pieces").

        :param normalized_lines: list of normalised lines
        :return: list of ``{"piece_id", "content", "type", "line"}`` dicts
        """
        pieces = []
        for line in normalized_lines:
            # Split on 。 、 and on both the half-width and the full-width
            # forms of the semicolon and comma.
            for unit in re.split(r"[。;;,,、]", line):
                unit = unit.strip()
                if not unit:
                    continue
                pieces.append({
                    "piece_id": f"piece_{uuid.uuid4().hex[:8]}",
                    "content": unit,
                    "type": self._get_piece_type(unit),
                    "line": line,
                })
        return pieces

    def annotate_level(self, pieces):
        """Annotate each piece with its level and the level-derived flags.

        Adds ``level``, ``position``, ``is_hide``, ``is_suspend`` and
        ``search_type`` to every piece.

        :param pieces: pieces from ``split_pieces``
        :return: list of the same (now annotated) piece dicts
        """
        annotated_pieces = []
        for piece in pieces:
            level = self._get_level(piece["content"])
            position, is_hide, is_suspend = self._LEVEL_ATTRS.get(
                level, ("", False, False)
            )
            piece["level"] = level
            piece["position"] = position
            piece["is_hide"] = is_hide
            piece["is_suspend"] = is_suspend
            # Hidden levels are reached by keyword search, the others by
            # prompt navigation.
            piece["search_type"] = "关键词检索" if is_hide else "提示词导航"
            annotated_pieces.append(piece)
        return annotated_pieces

    def bind_grid(self, pieces):
        """Bind each piece to a cell of one of the three nine-cell grids.

        Adds ``grid_type``, ``grid_col``, ``grid_row``, ``grid_id`` and the
        grid attributes ``font``, ``call_type``, ``num_system``, ``quant_dim``.

        :param pieces: level-annotated pieces
        :return: list of the same (now grid-bound) piece dicts
        """
        grid_pieces = []
        for piece in pieces:
            # Pieces without a recognised level default to the centre grid,
            # row 1; pieces without a recognised type default to column c.
            grid_type, row = self._LEVEL_GRID.get(piece["level"], ("中", "1"))
            col = self._TYPE_COLUMN.get(piece["type"], "c")
            grid_attr = GRID_RULES["grid_type"][grid_type]
            piece["grid_type"] = grid_type
            piece["grid_col"] = col
            piece["grid_row"] = row
            piece["grid_id"] = f"{grid_type}-{col}{row}_{piece['piece_id']}"
            piece["font"] = grid_attr["font"]
            piece["call_type"] = grid_attr["call_type"]
            piece["num_system"] = grid_attr["num_system"]
            piece["quant_dim"] = grid_attr["quant_dim"]
            grid_pieces.append(piece)
        return grid_pieces

    def deep_semantic(self, pieces):
        """Add word-sense, classification and "universe" annotations.

        Adds ``word_type``, ``classify_type``, ``param``, ``language``,
        ``op``, ``universe`` and ``graph_type`` to every piece.

        :param pieces: grid-bound pieces
        :return: list of the same (now semantically annotated) piece dicts
        """
        semantic_pieces = []
        for piece in pieces:
            content = piece["content"]
            # 1. Word sense: 本 / 迹 / 原 (原 when nothing matches).
            word_type = self._first_match(content, self._WORD_RULES, "原")
            # 2. Classification: 狭义 / 广义 / 绝对 (狭义 when nothing matches).
            classify_type, param, language, op = self._first_match(
                content, self._CLASSIFY_RULES, self._CLASSIFY_RULES[0][1]
            )
            # 3. Associated universe (the language universe when nothing matches).
            universe, graph_type = self._first_match(
                content, self._UNIVERSE_RULES, self._UNIVERSE_RULES[0][1]
            )
            piece["word_type"] = word_type
            piece["classify_type"] = classify_type
            piece["param"] = param
            piece["language"] = language
            piece["op"] = op
            piece["universe"] = universe
            piece["graph_type"] = graph_type
            semantic_pieces.append(piece)
        return semantic_pieces

    def map_text_symbol_number(self, pieces):
        """Map each piece text -> symbol -> number.

        Adds ``symbol_name``, ``base_num``, ``quant_value`` and ``tnt`` to
        every piece and builds a flat mapping table.

        :param pieces: semantically annotated pieces
        :return: ``(mapped_pieces, mapping_table)``
        """
        # Multi-character terms, longest first, for the containment fallback.
        # Single-character terms (e.g. 道, 名) are too ambiguous to match by
        # containment and are only matched exactly.
        contained_terms = sorted(
            (text for text in TEXT_TO_SYMBOL if len(text) > 1),
            key=len,
            reverse=True,
        )
        mapped_pieces = []
        mapping_table = []
        for piece in pieces:
            content = piece["content"]
            # 1. Text -> symbol
            symbol_name = self._lookup_symbol(content, contained_terms)
            # 2. Symbol -> number (0 when there is no symbol)
            base_num = SYMBOL_TO_BASE_NUM.get(symbol_name, 0)
            num_system = piece["num_system"]
            quant_dim = piece["quant_dim"]
            if quant_dim == "色度":
                quant_value = base_num % 256
            elif quant_dim == "亮度":
                # NOTE(review): the manual labels this range 0-1, but base
                # numbers above 100 yield values above 1 - confirm intent.
                quant_value = round(base_num / 100, 2)
            elif quant_dim == "明度":
                quant_value = base_num % 100
            else:
                quant_value = 0
            # Time-Number-Truth triple derived from the base number.
            tnt = {
                "Time": base_num,
                "Number": base_num % 10,
                "Truth": 1 if base_num > 0 else 0,
            }
            piece["symbol_name"] = symbol_name
            piece["base_num"] = base_num
            piece["quant_value"] = quant_value
            piece["tnt"] = tnt
            mapped_pieces.append(piece)
            mapping_table.append({
                "piece_id": piece["piece_id"],
                "text_content": content,
                "symbol_name": symbol_name,
                "base_num": base_num,
                "num_system": num_system,
                "quant_dim": quant_dim,
                "quant_value": quant_value,
            })
        return mapped_pieces, mapping_table

    @staticmethod
    def _first_match(content, rules, default):
        """Return the result of the first rule with a keyword in *content*."""
        for keywords, result in rules:
            if any(key in content for key in keywords):
                return result
        return default

    @staticmethod
    def _lookup_symbol(content, contained_terms):
        """Resolve the symbol name for a piece, or "" when none applies.

        Tries, in order: the whole content, the label before the first colon
        (pieces usually look like "term:description"), then the longest known
        multi-character term contained in the content.
        """
        symbol = TEXT_TO_SYMBOL.get(content)
        if symbol:
            return symbol
        head = re.split(r"[::]", content, maxsplit=1)[0].strip()
        symbol = TEXT_TO_SYMBOL.get(head)
        if symbol:
            return symbol
        for text in contained_terms:
            if text in content:
                return TEXT_TO_SYMBOL[text]
        return ""

    def _get_piece_type(self, content):
        """Identify the piece type (料 / 边 / 角); defaults to 料."""
        for ptype, keywords in self.piece_type_keywords.items():
            if any(key in content for key in keywords):
                return ptype
        return "料"

    def _get_level(self, content):
        """Identify the level (总纲 / 总目 / 总科 / 总属), or "" when none."""
        # An explicit level name in the text is the strongest signal, so it
        # is checked before the looser keyword lists.
        for level in self.level_keywords:
            if level in content:
                return level
        for level, keywords in self.level_keywords.items():
            if any(key in content for key in keywords):
                return level
        return ""
# 补充缺失的import
import re
# Smoke test: push two sample lines through every pipeline stage in order.
if __name__ == "__main__":
    core = ParserCore()
    sample_lines = ["总纲:三套九宫格,内容表:现象", "总科:角色表,认知内核:血液"]
    # split -> annotate -> bind -> deepen -> map
    stage = core.split_pieces(sample_lines)
    stage = core.annotate_level(stage)
    stage = core.bind_grid(stage)
    stage = core.deep_semantic(stage)
    _, table = core.map_text_symbol_number(stage)
    print("映射表:", table[0])
2.4 校验模块(core/validator.py)
python
python
"""解析结果校验模块:按原文档规则校验"""
from config.symbol_mapping import GRID_RULES
class Validator:
    """Validate parsed pieces against the grid rules of the source document."""

    def __init__(self):
        self.col_rule = GRID_RULES["col_rule"]
        self.row_rule = GRID_RULES["row_rule"]
        self.grid_type_rule = GRID_RULES["grid_type"]

    def validate_all(self, pieces):
        """Run every check on every piece.

        :param pieces: pieces carrying grid binding and mapping information
        :return: ``{"is_pass": bool, "errors": [str, ...]}``; every violation
            found is reported, not only the first one per check
        """
        checks = (
            self._validate_col,
            self._validate_row,
            self._validate_taiyi_dao_ming,
            self._validate_mapping,
        )
        errors = []
        for piece in pieces:
            for check in checks:
                errors.extend(check(piece))
        return {
            "is_pass": not errors,
            "errors": errors,
        }

    def _validate_col(self, piece):
        """Column check (a=角 / b=边 / c=料); returns a list of messages."""
        col = piece["grid_col"]
        piece_type = piece["type"]
        expected_type = self.col_rule.get(col, "")
        if expected_type and piece_type != expected_type:
            return [f"拼块{piece['piece_id']}列绑定错误:{col}列应绑定{expected_type},实际{piece_type}"]
        return []

    def _validate_row(self, piece):
        """Row check (1=总纲/总目, 2=总科, 3=总属); returns a list of messages."""
        row = piece["grid_row"]
        level = piece["level"]
        if not level:
            # A piece with no recognised level was placed by the binder's
            # default, not by a rule, so there is no row rule it can violate.
            return []
        expected_levels = self.row_rule.get(row, [])
        if expected_levels and level not in expected_levels:
            return [f"拼块{piece['piece_id']}行绑定错误:{row}行应绑定{expected_levels},实际{level}"]
        return []

    def _validate_taiyi_dao_ming(self, piece):
        """Font and call-type check for the piece's grid type.

        Both attributes are checked independently so that a font error does
        not hide a call-type error. Returns a list of messages.
        """
        grid_type = piece["grid_type"]
        expected_rule = self.grid_type_rule.get(grid_type, {})
        if not expected_rule:
            return []
        errors = []
        if piece["font"] != expected_rule["font"]:
            errors.append(
                f"拼块{piece['piece_id']}字体错误:{grid_type}九宫格应使用{expected_rule['font']},实际{piece['font']}"
            )
        if piece["call_type"] != expected_rule["call_type"]:
            errors.append(
                f"拼块{piece['piece_id']}调用类型错误:{grid_type}九宫格应使用{expected_rule['call_type']},实际{piece['call_type']}"
            )
        return errors

    def _validate_mapping(self, piece):
        """Mapping completeness check for pieces that mention a core term.

        A missing symbol implies a zero number, so only the symbol error is
        reported in that case. Returns a list of messages.
        """
        # NOTE(review): "九宫格" has no entry in the symbol table shown in
        # this manual, so pieces containing it are always reported - confirm
        # that this is intended.
        core_keywords = ["法身", "报身", "应身", "九宫格", "正则式求是"]
        if not any(key in piece["content"] for key in core_keywords):
            return []
        if not piece["symbol_name"]:
            return [f"拼块{piece['piece_id']}符号映射缺失:核心术语{piece['content']}未映射到符号"]
        if piece["base_num"] == 0:
            return [f"拼块{piece['piece_id']}数字映射缺失:核心术语{piece['content']}数字值为0"]
        return []
# Smoke test: validate one deliberately broken piece and print the findings.
if __name__ == "__main__":
    checker = Validator()
    broken_piece = {
        "piece_id": "test123",
        "grid_col": "a",
        "type": "边",  # wrong: column a expects 角
        "grid_row": "1",
        "level": "总科",  # wrong: row 1 expects 总纲 / 总目
        "grid_type": "中",
        "font": "斜体",  # wrong: the 中 grid expects 宋体
        "call_type": "即时at-a-time",  # wrong: the 中 grid expects the delayed call type
        "content": "法身",
        "symbol_name": "",  # wrong: a core term without a symbol
        "base_num": 0
    }
    report = checker.validate_all([broken_piece])
    print("校验结果:", report["errors"])
2.5 输出层模块(core/output_handler.py)
python
python
"""输出层:结构化数据生成+多格式导出"""
import json
import os
import csv
import pymysql
from openpyxl import Workbook
from config.db_config import DB_CONFIG, CREATE_TABLE_SQLS
class OutputHandler:
    """Output layer: export parsed pieces and the mapping table.

    Supports JSON, Excel and CSV files (written below ``output_dir``) and
    MySQL (using the module-level ``DB_CONFIG``).
    """

    # CSV header used when the mapping table is empty and the columns cannot
    # be inferred from a first row.
    _MAPPING_FIELDS = (
        "piece_id",
        "text_content",
        "symbol_name",
        "base_num",
        "num_system",
        "quant_dim",
        "quant_value",
    )

    def __init__(self, output_dir="output"):
        self.output_dir = output_dir
        # Create the output directory on first use.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Database handles; only set while export_mysql is running.
        self.db_conn = None
        self.db_cursor = None

    def export_json(self, pieces, mapping_table, filename="parser_result.json"):
        """Write pieces and mapping table to one JSON file; return its path."""
        output_data = {
            "pieces": pieces,
            "mapping_table": mapping_table,
        }
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(output_data, f, ensure_ascii=False, indent=4)
        return file_path

    def export_excel(self, pieces, mapping_table, filename="parser_result.xlsx"):
        """Write a workbook with a grid sheet and a mapping sheet; return its path."""
        wb = Workbook()
        # 1. Grid sheet
        ws_grid = wb.active
        ws_grid.title = "三套九宫格"
        ws_grid.append(["拼块ID", "内容", "类型", "层级", "九宫格类型", "列", "行", "符号", "数字值", "数制", "量化维度", "量化值"])
        grid_keys = (
            "piece_id", "content", "type", "level", "grid_type", "grid_col",
            "grid_row", "symbol_name", "base_num", "num_system", "quant_dim",
            "quant_value",
        )
        for piece in pieces:
            ws_grid.append([piece[key] for key in grid_keys])
        # 2. Mapping sheet
        ws_mapping = wb.create_sheet(title="文字-符号-数字映射")
        ws_mapping.append(["拼块ID", "文字内容", "符号名称", "基础数字", "数制", "量化维度", "量化值"])
        for item in mapping_table:
            ws_mapping.append([item[key] for key in self._MAPPING_FIELDS])
        file_path = os.path.join(self.output_dir, filename)
        wb.save(file_path)
        return file_path

    def export_csv(self, mapping_table, filename="mapping_table.csv"):
        """Write the mapping table as CSV; return the file path.

        An empty table produces a header-only file instead of raising.
        """
        file_path = os.path.join(self.output_dir, filename)
        if mapping_table:
            fieldnames = list(mapping_table[0].keys())
        else:
            fieldnames = list(self._MAPPING_FIELDS)
        with open(file_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(mapping_table)
        return file_path

    def export_mysql(self, pieces, mapping_table):
        """Export pieces and mapping table to MySQL.

        :return: a success message
        :raises RuntimeError: on any failure; the original exception is
            attached as ``__cause__``
        """
        try:
            # Connect without selecting a database: the target database may
            # not exist yet and is created just below.
            connect_args = {
                key: value
                for key, value in DB_CONFIG.items()
                if key not in ("db", "database")
            }
            self.db_conn = pymysql.connect(**connect_args)
            self.db_cursor = self.db_conn.cursor()
            # 1. Create the database if needed. Identifiers cannot be bound
            #    as parameters, so the name is backtick-quoted instead.
            quoted_db = str(DB_CONFIG["db"]).replace("`", "``")
            self.db_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{quoted_db}`")
            self.db_conn.select_db(DB_CONFIG["db"])
            # 2. Create the tables.
            for create_sql in CREATE_TABLE_SQLS.values():
                self.db_cursor.execute(create_sql)
            # 3. Insert the grid rows, then 4. the mapping rows.
            self._insert_grid_data(pieces)
            self._insert_mapping_data(mapping_table)
            self.db_conn.commit()
            return "数据库导出成功"
        except Exception as e:
            # The connection is still None when connect() itself failed.
            if self.db_conn is not None:
                try:
                    self.db_conn.rollback()
                except Exception:
                    # Best effort: a failed rollback must not hide the
                    # original error, which is re-raised below.
                    pass
            raise RuntimeError(f"数据库导出失败:{str(e)}") from e
        finally:
            if self.db_cursor is not None:
                self.db_cursor.close()
            if self.db_conn is not None:
                self.db_conn.close()
            # Do not keep references to closed handles.
            self.db_cursor = None
            self.db_conn = None

    def _insert_grid_data(self, pieces):
        """Upsert one ``core_three_grid`` row per piece."""
        insert_sql = """
INSERT INTO core_three_grid (grid_id, grid_name, piece_id, piece_content, symbol_name, number_value, num_system, quant_dim, font, call_type)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE piece_content = VALUES(piece_content)
"""
        for piece in pieces:
            grid_name = f"{piece['grid_type']}-{piece['grid_col']}{piece['grid_row']}"
            self.db_cursor.execute(insert_sql, (
                piece["grid_id"],
                grid_name,
                piece["piece_id"],
                piece["content"],
                piece["symbol_name"],
                piece["base_num"],
                piece["num_system"],
                piece["quant_dim"],
                piece["font"],
                piece["call_type"],
            ))

    def _insert_mapping_data(self, mapping_table):
        """Upsert one ``text_symbol_mapping`` row per mapping entry."""
        insert_sql = """
INSERT INTO text_symbol_mapping (text_content, symbol_name, base_num)
VALUES (%s, %s, %s)
ON DUPLICATE KEY UPDATE symbol_name = VALUES(symbol_name), base_num = VALUES(base_num)
"""
        for item in mapping_table:
            self.db_cursor.execute(insert_sql, (
                item["text_content"],
                item["symbol_name"],
                item["base_num"],
            ))
# Smoke test: export one hand-built piece to JSON, Excel and CSV.
if __name__ == "__main__":
    exporter = OutputHandler()
    # Fields shared by the piece record and its mapping-table row.
    shared_fields = {
        "piece_id": "test123",
        "symbol_name": "SYMBOL_BODY_LAW",
        "base_num": 10,
        "num_system": 2,
        "quant_dim": "色度",
        "quant_value": 10,
    }
    sample_pieces = [{
        "piece_id": shared_fields["piece_id"],
        "content": "法身",
        "type": "角",
        "level": "总纲",
        "grid_type": "中",
        "grid_col": "a",
        "grid_row": "1",
        "symbol_name": shared_fields["symbol_name"],
        "base_num": shared_fields["base_num"],
        "num_system": shared_fields["num_system"],
        "quant_dim": shared_fields["quant_dim"],
        "quant_value": shared_fields["quant_value"],
        "font": "宋体",
        "call_type": "延时by-times",
        "grid_id": "中-a1_test123",
    }]
    sample_mapping = [{
        "piece_id": shared_fields["piece_id"],
        "text_content": "法身",
        "symbol_name": shared_fields["symbol_name"],
        "base_num": shared_fields["base_num"],
        "num_system": shared_fields["num_system"],
        "quant_dim": shared_fields["quant_dim"],
        "quant_value": shared_fields["quant_value"],
    }]
    # JSON export
    print("JSON导出路径:", exporter.export_json(sample_pieces, sample_mapping))
    # Excel export
    print("Excel导出路径:", exporter.export_excel(sample_pieces, sample_mapping))
    # CSV export
    print("CSV导出路径:", exporter.export_csv(sample_mapping))
3. GUI 开发完整代码
3.1 主界面代码(gui/main_window.py)
python
python
"""GUI主界面:PyQt5实现"""
import sys
import os
import logging
from PyQt5.QtWidgets import (
QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QPushButton, QLabel, QLineEdit, QFileDialog, QTextEdit,
QProgressBar, QTabWidget, QComboBox, QMessageBox, QGroupBox
)
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QTimer
from core.input_handler import InputHandler
from core.parser_core import ParserCore
from core.validator import Validator
from core.output_handler import OutputHandler
from config.db_config import DB_CONFIG
# Configure logging. basicConfig opens the log file immediately, so the
# directory has to exist before this runs (i.e. at import time).
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename="logs/parser_tool.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    # Log messages are Chinese; do not depend on the platform default encoding.
    encoding="utf-8",
)
class ParserThread(QThread):
    """Background worker that runs the parsing pipeline off the UI thread."""

    progress_signal = pyqtSignal(int, str)  # (percent, current step)
    result_signal = pyqtSignal(list, list)  # (pieces, mapping table)
    error_signal = pyqtSignal(str)  # error message

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path

    def run(self):
        """Parse ``self.file_path`` stage by stage, reporting progress."""
        report = self.progress_signal.emit
        try:
            # Stage 1: read and normalise the document.
            report(10, "解析文档并归一化")
            reader = InputHandler()
            lines = reader.normalize_content(reader.parse_document(self.file_path))
            # Stage 2: core parsing; each step feeds the next.
            report(30, "拆分文字拼块")
            core = ParserCore()
            current = core.split_pieces(lines)
            report(40, "标注8个三体系层级")
            current = core.annotate_level(current)
            report(50, "绑定三套九宫格位置")
            current = core.bind_grid(current)
            report(60, "语义深化处理")
            current = core.deep_semantic(current)
            report(70, "文字→符号→数字映射")
            final_pieces, table = core.map_text_symbol_number(current)
            report(80, "解析完成")
            self.result_signal.emit(final_pieces, table)
        except Exception as exc:
            failure = f"解析失败:{str(exc)}"
            self.error_signal.emit(failure)
            logging.error(failure)
class ExportThread(QThread):
    """Background worker that exports parse results in the chosen format."""

    progress_signal = pyqtSignal(int, str)
    finish_signal = pyqtSignal(str)
    error_signal = pyqtSignal(str)

    def __init__(self, export_type, pieces, mapping_table, db_config=None):
        super().__init__()
        self.export_type = export_type
        self.pieces = pieces
        self.mapping_table = mapping_table
        self.db_config = db_config or DB_CONFIG

    def run(self):
        """Run the export selected by ``self.export_type`` and report the outcome."""
        try:
            exporter = OutputHandler()

            def to_mysql():
                # Push the GUI-supplied settings into the shared config
                # before the exporter reads it.
                DB_CONFIG.update(self.db_config)
                return exporter.export_mysql(self.pieces, self.mapping_table)

            # format -> (progress label, export action)
            actions = {
                "JSON": ("生成JSON数据", lambda: exporter.export_json(self.pieces, self.mapping_table)),
                "Excel": ("生成Excel表格", lambda: exporter.export_excel(self.pieces, self.mapping_table)),
                "CSV": ("生成CSV文件", lambda: exporter.export_csv(self.mapping_table)),
                "MySQL": ("连接数据库", to_mysql),
            }
            chosen = actions.get(self.export_type)
            if chosen is None:
                self.error_signal.emit("不支持的导出格式")
                return
            label, action = chosen
            self.progress_signal.emit(20, label)
            outcome = action()
            self.finish_signal.emit(f"{self.export_type}导出成功:{outcome}")
        except Exception as exc:
            failure = f"导出失败:{str(exc)}"
            self.error_signal.emit(failure)
            logging.error(failure)
class DocParserWindow(QMainWindow):
    """Main window: document selection, parsing, validation display and export."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle("文档解析辅助工具")
        self.setGeometry(100, 100, 1200, 800)
        # Window state
        self.file_path = ""
        self.mapped_pieces = []
        self.mapping_table = []
        self.parser_thread = None
        self.export_thread = None
        self._init_ui()
        # Make sure the log directory exists.
        os.makedirs("logs", exist_ok=True)

    def _init_ui(self):
        """Build the widget tree and connect the signals."""
        # QFormLayout is not part of this module's top-level PyQt5 import
        # list, so it is imported here where it is needed.
        from PyQt5.QtWidgets import QFormLayout

        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)
        # 1. Document selection
        file_group = QGroupBox("文档选择")
        file_layout = QHBoxLayout()
        self.file_label = QLabel("未选择文件")
        file_btn = QPushButton("选择文档")
        file_btn.clicked.connect(self._select_file)
        file_layout.addWidget(self.file_label)
        file_layout.addWidget(file_btn)
        file_group.setLayout(file_layout)
        main_layout.addWidget(file_group)
        # 2. Parse controls
        parse_group = QGroupBox("解析控制")
        parse_layout = QHBoxLayout()
        self.parse_btn = QPushButton("开始解析")
        self.parse_btn.clicked.connect(self._start_parse)
        self.parse_btn.setEnabled(False)
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        self.step_label = QLabel("当前步骤:未开始")
        parse_layout.addWidget(self.parse_btn)
        parse_layout.addWidget(self.progress_bar)
        parse_layout.addWidget(self.step_label)
        parse_group.setLayout(parse_layout)
        main_layout.addWidget(parse_group)
        # 3. Result tabs
        result_tab = QTabWidget()
        # 3.1 Validation result
        self.validate_text = QTextEdit()
        self.validate_text.setReadOnly(True)
        result_tab.addTab(self.validate_text, "校验结果")
        # 3.2 Grid result
        self.grid_text = QTextEdit()
        self.grid_text.setReadOnly(True)
        result_tab.addTab(self.grid_text, "九宫格解析结果")
        # 3.3 Mapping table
        self.mapping_text = QTextEdit()
        self.mapping_text.setReadOnly(True)
        result_tab.addTab(self.mapping_text, "文字-符号-数字映射表")
        main_layout.addWidget(result_tab)
        # 4. Export settings
        export_group = QGroupBox("导出设置")
        export_layout = QVBoxLayout()
        # 4.1 Export format
        format_layout = QHBoxLayout()
        format_label = QLabel("导出格式:")
        self.format_combo = QComboBox()
        self.format_combo.addItems(["JSON", "Excel", "CSV", "MySQL"])
        format_layout.addWidget(format_label)
        format_layout.addWidget(self.format_combo)
        # 4.2 Database settings (shown only for MySQL)
        self.db_group = QGroupBox("数据库配置")
        db_layout = QFormLayout()
        self.db_host = QLineEdit(DB_CONFIG["host"])
        self.db_user = QLineEdit(DB_CONFIG["user"])
        self.db_pwd = QLineEdit(DB_CONFIG["password"])
        self.db_pwd.setEchoMode(QLineEdit.Password)
        self.db_name = QLineEdit(DB_CONFIG["db"])
        db_layout.addRow("主机:", self.db_host)
        db_layout.addRow("用户名:", self.db_user)
        db_layout.addRow("密码:", self.db_pwd)
        db_layout.addRow("数据库名:", self.db_name)
        self.db_group.setLayout(db_layout)
        self.db_group.setVisible(False)
        # 4.3 Export button and progress
        export_btn_layout = QHBoxLayout()
        self.export_btn = QPushButton("开始导出")
        self.export_btn.clicked.connect(self._start_export)
        self.export_btn.setEnabled(False)
        self.export_progress = QProgressBar()
        self.export_progress.setRange(0, 100)
        export_btn_layout.addWidget(self.export_btn)
        export_btn_layout.addWidget(self.export_progress)
        # 4.4 Export log
        self.export_log = QTextEdit()
        self.export_log.setReadOnly(True)
        self.export_log.setMaximumHeight(100)
        # Assemble the export section
        export_layout.addLayout(format_layout)
        export_layout.addWidget(self.db_group)
        export_layout.addLayout(export_btn_layout)
        export_layout.addWidget(self.export_log)
        export_group.setLayout(export_layout)
        main_layout.addWidget(export_group)
        # React to format changes
        self.format_combo.currentTextChanged.connect(self._on_format_change)

    @staticmethod
    def _show_colored(widget, text, color):
        """Replace the widget's text with *text* rendered in *color*."""
        # The colour has to be set before the text is inserted; setting it
        # afterwards only affects text inserted later.
        widget.clear()
        widget.setTextColor(color)
        widget.insertPlainText(text)

    def _select_file(self):
        """Let the user pick a document and reset the previous results."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "选择文档", "", "所有支持格式 (*.txt *.docx *.md);;TXT文件 (*.txt);;DOCX文件 (*.docx);;MD文件 (*.md)"
        )
        if file_path:
            self.file_path = file_path
            self.file_label.setText(f"已选择:{os.path.basename(file_path)}")
            self.parse_btn.setEnabled(True)
            self.validate_text.clear()
            self.grid_text.clear()
            self.mapping_text.clear()
            # Results of the previous document must not stay exportable.
            self.mapped_pieces = []
            self.mapping_table = []
            self.export_btn.setEnabled(False)
            logging.info(f"选择文档:{file_path}")

    def _start_parse(self):
        """Start parsing the selected document in a worker thread."""
        self.parse_btn.setEnabled(False)
        self.progress_bar.setValue(0)
        self.step_label.setText("当前步骤:解析文档并归一化")
        self.parser_thread = ParserThread(self.file_path)
        self.parser_thread.progress_signal.connect(self._update_parse_progress)
        self.parser_thread.result_signal.connect(self._on_parse_finish)
        self.parser_thread.error_signal.connect(self._on_parse_error)
        self.parser_thread.start()

    def _update_parse_progress(self, progress, step):
        """Show parse progress."""
        self.progress_bar.setValue(progress)
        self.step_label.setText(f"当前步骤:{step}")

    def _on_parse_finish(self, pieces, mapping_table):
        """Validate and display the parse results."""
        self.mapped_pieces = pieces
        self.mapping_table = mapping_table
        self.progress_bar.setValue(100)
        self.step_label.setText("当前步骤:解析完成")
        validate_result = Validator().validate_all(pieces)
        if validate_result["is_pass"]:
            self._show_colored(
                self.validate_text, "✅ 校验通过:所有解析结果符合原文档规则", Qt.green
            )
        else:
            error_text = "❌ 校验失败:\n" + "\n".join(validate_result["errors"])
            self._show_colored(self.validate_text, error_text, Qt.red)
        # Export is offered only for results that passed validation; a failed
        # run also revokes an enable left over from an earlier run.
        self.export_btn.setEnabled(validate_result["is_pass"])
        # Allow the same document to be parsed again.
        self.parse_btn.setEnabled(True)
        # Grid results (first 10 pieces only)
        grid_text = "".join(
            f"拼块ID:{piece['piece_id']}\n"
            f"内容:{piece['content']}\n"
            f"九宫格:{piece['grid_type']}-{piece['grid_col']}{piece['grid_row']}\n"
            f"符号:{piece['symbol_name']} | 数字:{piece['base_num']}\n"
            "------------------------\n"
            for piece in pieces[:10]
        )
        self.grid_text.setText(grid_text)
        # Mapping table (first 10 entries only)
        mapping_text = "".join(
            f"文字:{item['text_content']}\n"
            f"符号:{item['symbol_name']}\n"
            f"数字:{item['base_num']}(数制:{item['num_system']})\n"
            "------------------------\n"
            for item in mapping_table[:10]
        )
        self.mapping_text.setText(mapping_text)
        logging.info("解析完成,共处理{}个拼块".format(len(pieces)))

    def _on_parse_error(self, error):
        """Show a parse failure."""
        self.progress_bar.setValue(0)
        self.step_label.setText("当前步骤:解析失败")
        self._show_colored(self.validate_text, error, Qt.red)
        self.parse_btn.setEnabled(True)
        QMessageBox.critical(self, "错误", error)

    def _on_format_change(self, format):
        """Show the database settings only when MySQL is selected."""
        self.db_group.setVisible(format == "MySQL")

    def _start_export(self):
        """Start exporting the parse results in a worker thread."""
        if not self.mapped_pieces:
            QMessageBox.warning(self, "警告", "请先完成解析")
            return
        self.export_btn.setEnabled(False)
        self.export_progress.setValue(0)
        self.export_log.clear()
        export_type = self.format_combo.currentText()
        # Database settings as currently entered in the form
        db_config = {
            "host": self.db_host.text(),
            "user": self.db_user.text(),
            "password": self.db_pwd.text(),
            "db": self.db_name.text(),
            "charset": "utf8mb4",
        }
        self.export_thread = ExportThread(export_type, self.mapped_pieces, self.mapping_table, db_config)
        self.export_thread.progress_signal.connect(self._update_export_progress)
        self.export_thread.finish_signal.connect(self._on_export_finish)
        self.export_thread.error_signal.connect(self._on_export_error)
        self.export_thread.start()

    def _update_export_progress(self, progress, step):
        """Show export progress."""
        self.export_progress.setValue(progress)
        self.export_log.append(f"导出中:{step}")

    def _on_export_finish(self, msg):
        """Show a successful export."""
        self.export_progress.setValue(100)
        # Colour first, then append, so the message itself is coloured.
        self.export_log.setTextColor(Qt.green)
        self.export_log.append(msg)
        self.export_btn.setEnabled(True)
        logging.info(msg)

    def _on_export_error(self, error):
        """Show an export failure."""
        self.export_progress.setValue(0)
        self.export_log.setTextColor(Qt.red)
        self.export_log.append(error)
        self.export_btn.setEnabled(True)
        QMessageBox.critical(self, "错误", error)
# GUI entry point
def start_gui():
    """Create the application, show the main window and run the event loop.

    Exits the process with the event loop's return code.
    """
    application = QApplication(sys.argv)
    main_window = DocParserWindow()
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


if __name__ == "__main__":
    start_gui()
3.2 工具入口(main.py)
python
python
"""文档解析辅助工具 - 入口文件"""
import sys
from gui.main_window import start_gui
if __name__ == "__main__":
    # Select the Qt "Fusion" widget style; start_gui() creates the
    # QApplication afterwards.
    from PyQt5.QtWidgets import QApplication
    QApplication.setStyle("Fusion")
    # Launch the GUI.
    start_gui()
4. 单元测试完整代码
4.1 输入层测试(tests/test_input.py)
python
python
import unittest
import os
from core.input_handler import InputHandler
class TestInputHandler(unittest.TestCase):
    """Tests for TXT parsing and content normalisation."""

    def setUp(self):
        self.handler = InputHandler()
        # Scratch document with two content lines, a footer and a comment.
        self.test_txt = "test_input.txt"
        sample = "总纲:三套九宫格\n页脚:2026\n法身:本然清洁\n注释:测试注释\n"
        with open(self.test_txt, "w", encoding="utf-8") as handle:
            handle.write(sample)

    def tearDown(self):
        """Remove the scratch document."""
        if os.path.exists(self.test_txt):
            os.remove(self.test_txt)

    def test_parse_txt(self):
        """The raw text contains both content lines."""
        raw = self.handler.parse_document(self.test_txt)
        for expected in ("总纲:三套九宫格", "法身:本然清洁"):
            self.assertIn(expected, raw)

    def test_normalize_content(self):
        """The footer and the comment line are filtered out."""
        raw = self.handler.parse_document(self.test_txt)
        lines = self.handler.normalize_content(raw)
        self.assertEqual(len(lines), 2)
        self.assertEqual(lines[0], "总纲:三套九宫格")
        self.assertEqual(lines[1], "法身:本然清洁")


if __name__ == "__main__":
    unittest.main()
4.2 核心解析层测试(tests/test_parser.py)
python
python
import unittest
from core.parser_core import ParserCore
class TestParserCore(unittest.TestCase):
    """Tests for piece splitting, level annotation and grid binding."""

    def setUp(self):
        self.parser = ParserCore()
        self.test_lines = ["总纲:三套九宫格,内容表:现象", "总科:角色表,认知内核:血液"]

    def _split(self):
        """Split the sample lines into pieces."""
        return self.parser.split_pieces(self.test_lines)

    def _annotated(self):
        """Split the sample lines and annotate the levels."""
        return self.parser.annotate_level(self._split())

    def test_split_pieces(self):
        """Two lines with one comma each give four pieces."""
        units = self._split()
        self.assertEqual(len(units), 4)
        self.assertEqual(units[0]["type"], "料")  # piece 0 contains 总纲 -> 料
        self.assertEqual(units[2]["type"], "角")  # piece 2 contains 总科 -> 角

    def test_annotate_level(self):
        """Level names in the text determine the level and the hide flag."""
        labelled = self._annotated()
        self.assertEqual(labelled[0]["level"], "总纲")
        self.assertEqual(labelled[2]["level"], "总科")
        self.assertTrue(labelled[2]["is_hide"])  # 总科 is a hidden level

    def test_bind_grid(self):
        """Level selects the grid type, piece type selects the column."""
        bound = self.parser.bind_grid(self._annotated())
        self.assertEqual(bound[0]["grid_type"], "中")  # 总纲 -> centre grid
        self.assertEqual(bound[2]["grid_type"], "内")  # 总科 -> inner grid
        self.assertEqual(bound[0]["grid_col"], "c")  # 料 -> column c
        self.assertEqual(bound[2]["grid_col"], "a")  # 角 -> column a


if __name__ == "__main__":
    unittest.main()
4.3 输出层测试(tests/test_output.py)
python
python
import unittest
import os
from core.output_handler import OutputHandler
class TestOutputHandler(unittest.TestCase):
    """Tests for the JSON and Excel exporters."""

    def setUp(self):
        self.output_handler = OutputHandler("test_output")
        # Fields shared by the piece record and its mapping-table row.
        common = {
            "piece_id": "test123",
            "symbol_name": "SYMBOL_BODY_LAW",
            "base_num": 10,
            "num_system": 2,
            "quant_dim": "色度",
            "quant_value": 10,
        }
        self.test_pieces = [dict(
            common,
            content="法身",
            type="角",
            level="总纲",
            grid_type="中",
            grid_col="a",
            grid_row="1",
            font="宋体",
            call_type="延时by-times",
            grid_id="中-a1_test123",
        )]
        self.test_mapping = [dict(common, text_content="法身")]

    def tearDown(self):
        """Remove the scratch output directory."""
        if os.path.exists("test_output"):
            import shutil
            shutil.rmtree("test_output")

    def _assert_created_then_remove(self, exported_path):
        """Check that the export produced a file, then delete it."""
        self.assertTrue(os.path.exists(exported_path))
        os.remove(exported_path)

    def test_export_json(self):
        """JSON export writes a file."""
        self._assert_created_then_remove(
            self.output_handler.export_json(self.test_pieces, self.test_mapping)
        )

    def test_export_excel(self):
        """Excel export writes a file."""
        self._assert_created_then_remove(
            self.output_handler.export_excel(self.test_pieces, self.test_mapping)
        )


if __name__ == "__main__":
    unittest.main()
5. 打包与部署说明
5.1 打包为可执行文件(Windows)
bash
bash
# 安装pyinstaller
pip install pyinstaller
# 打包命令(单文件+窗口模式)
pyinstaller -F -w -i icon.ico main.py --name 文档解析辅助工具
# 说明:
# -F:打包为单个可执行文件
# -w:窗口模式(无控制台)
# -i:指定图标(可选)
# --name:指定生成的exe文件名
5.2 部署说明
- 环境要求:Windows/macOS/Linux,Python 3.9+(源码运行)或直接运行 exe 文件(Windows);
- MySQL 部署:若使用 MySQL 导出功能,需提前安装 MySQL 并创建用户,赋予数据库创建 / 表操作权限;
- 目录结构:运行工具后自动生成 logs/(日志)、output/(输出文件)目录;
- 使用流程:
  - 运行 文档解析辅助工具.exe(Windows)或 python main.py(源码);
  - 点击「选择文档」,选择 TXT/MD/DOCX 格式的设计文档;
  - 点击「开始解析」,等待解析完成;
  - 查看校验结果,校验通过后选择导出格式;
  - 点击「开始导出」,导出结果到 output/ 目录。
总结
核心实现要点
- 规则严格对齐:所有解析逻辑、映射规则均严格贴合原设计文档,确保无偏差;
- 分层解耦设计:输入 / 解析 / 输出 / 校验模块独立,便于维护和扩展;
- 轻量易落地:基于 Python + 轻量库开发,1-2 周可完成开发 + 测试;
- 全格式支持:输入支持 TXT/MD/DOCX,输出支持 JSON/Excel/CSV/MySQL;
- 可视化交互:PyQt5 GUI 界面,操作简单,结果可视化展示。
后续扩展建议
- 新增规则配置界面:支持可视化配置解析规则,无需修改代码;
- 新增批量解析功能:支持多文档批量解析;
- 优化语义解析精度:集成轻量 NLP 模型(如 jieba 分词)提升拼块拆分精度;
- 新增规则版本管理:支持不同版本设计文档的解析规则切换。
该手册包含工具开发所需的全部核心代码,开发人员可直接基于手册完成编码、测试、打包、部署,快速落地文档解析辅助工具,解决程序 "读对文档" 的核心问题。
- 详细编码实现手册中每个子模块的完整伪代码
- 核心函数代码
- GUI 开发代码