前言
在日常工作中,我们经常需要处理Word文档,特别是从中提取关键信息,如标题、段落等。今天,我们将利用Python来实现这一功能,并为大家提供一段完整的代码示例。
准备工作
首先,你需要安装python-docx库,这是一个用于处理Word文档的Python库。代码中还会用到collections模块(用于数据去重),它属于Python标准库,无需单独安装。你可以使用以下命令安装python-docx:
bash
pip install python-docx
代码讲解
下面是实现该功能的完整代码:
python
#!/usr/bin/env python3
# coding:utf-8
import re
import docx
import os
from collections import OrderedDict
# ---------------------------------------------------------------------------
# Shared module-level state used by the functions defined later in this file.
# Most of the lists start empty and are filled in by creat_role().
# ---------------------------------------------------------------------------

# Circled numbers followed by Roman numerals; get_second_title() maps a
# heading that starts with any of them to the marker class '①'.
spell1 = [
    '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩',
    '⑪', '⑫', '⑬', '⑭', '⑮', '⑯', '⑰', '⑱', '⑲', '⑳',
    'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ',
]
spell = []          # ASCII letters, appended by creat_role()

garde = []          # level-tagged headings, appended by get_second_title()
interfere = []      # single Chinese numerals, appended by creat_role()

# Criteria used to decide whether a line is a heading.
first_title = []    # numeral + '、' markers, matched against the first 2 characters
first_title1 = []   # same idea, matched against the first 3 characters
first_title2 = []   # same idea, matched against the first 4 characters
kew_word = ['是', '要']  # not referenced by the code visible in this file

# Heading markers grouped by the prefix length get_title() compares them with.
# The slices build a new list, so later appends never touch spell1.
one_characters = spell1[:20] + ['第', '甲', '乙', '丙', '---'] + spell1[20:]
two_characters = []
three_characters = []
four_characters = []
five_characters = []

year = []           # four-digit year strings, appended by creat_role()
number1 = []        # integers 1-9, appended by creat_role()
number2 = []        # integers 10-99, appended by creat_role()
def main():
    """Extract and print the level-tagged headings of every document in ./test.

    Initialises the marker tables with creat_role(), runs each ``.docx`` file
    in the folder through read_data() -> get_title() -> get_second_title(),
    then prints the headings collected in the module-level ``garde`` list,
    one per line, with repeats removed and first-seen order preserved.
    """
    creat_role()
    path = r'./test'  # folder that holds the documents to analyse
    for entry in os.listdir(path):
        # Only hand real Word documents to python-docx: skip other files,
        # sub-directories and the '~$' lock files Word creates while a
        # document is open.
        if entry.startswith('~$') or not entry.lower().endswith('.docx'):
            continue
        # os.path.join keeps the path valid on every platform; the previous
        # hard-coded backslash separator only worked on Windows.
        file_path = os.path.join(path, entry)
        content = read_data(file_path)
        title = get_title(content)
        get_second_title(title)
    # garde accumulates across all files; fromkeys() drops repeats while
    # keeping the order in which headings were first seen.
    for heading in OrderedDict.fromkeys(garde):
        print(heading)
def read_data(filename):
    """Collect heading candidates and paragraph text from a Word document.

    Every run of every paragraph contributes one entry to the result:

    * a bold run whose text is longer than 5 and shorter than 50 characters
      and does not end with ',' contributes its own text plus the suffix
      'label';
    * any other run contributes the full text of its paragraph, so a
      paragraph made of several such runs appears several times.

    NOTE(review): indentation was lost in the source this was taken from;
    the ``else`` is assumed to pair with the ``if`` inside the run loop.

    :param filename: path of the document, passed to ``docx.Document``.
    :return: list of strings in document order.
    """
    document = docx.Document(filename)
    collected = []
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            bold_heading = (
                run.bold
                and 5 < len(run.text) < 50
                and run.text[-1] != ','
            )
            entry = run.text + 'label' if bold_heading else paragraph.text
            collected.append(entry)
    return collected
def get_title(ls):
    """Return the entries of *ls* that look like headings.

    All whitespace is removed from each entry first; entries that become
    empty are dropped.  An entry is kept when

    * its first 1-5 characters match a marker in ``one_characters`` ...
      ``five_characters`` (prefix length 1 ... 5 respectively), or
    * it ends with the 'label' suffix that read_data() appends to bold runs.

    The suffix test used to be ``i[:-5] == 'label'``, which compares
    everything *except* the last five characters and therefore never matched
    a tagged bold run.

    :param ls: iterable of strings.
    :return: list of whitespace-free strings, in input order.
    """
    # (prefix length, marker table) pairs; looked up at call time so the
    # tables filled by creat_role() are seen.
    marker_tables = (
        (1, one_characters),
        (2, two_characters),
        (3, three_characters),
        (4, four_characters),
        (5, five_characters),
    )
    title = []
    for raw in ls:
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        text = re.sub(r'\s', '', raw)
        if not text:
            continue
        has_marker = any(text[:size] in table for size, table in marker_tables)
        if has_marker or text.endswith('label'):
            title.append(text)
    return title
def get_second_title(ls):
    """Tag every heading in *ls* with its nesting level and append it to ``garde``.

    Headings whose prefix is in ``first_title`` / ``first_title1`` /
    ``first_title2`` are first-level and get ``<first>``.  Every other
    heading is reduced to a marker class (Chinese numeral, digit, ASCII
    letter, circled/Roman numeral, or its literal first character).  Within
    the current first-level section, the first class seen is ``<second>``,
    the next new class ``<third>``, and so on up to ``<fifth>``.  Only the
    text before the first '。' is stored.

    Lines that look like dates are skipped: second character '年' (unless
    the fourth is ','), a prefix listed in ``year``, or a prefix of '一九'
    / '二零'.

    Differences from the previous implementation:

    * prefix tests use slices, so headings shorter than four characters no
      longer raise ``IndexError``; empty strings are ignored;
    * the level is looked up with the heading's marker class rather than its
      raw first character, so e.g. "2." following "(1)" is no longer demoted
      a level;
    * the unreachable ``i[:2].isdecimal()`` branch and the unused ``number``
      list are gone, and the four near-identical level ladders are merged.

    :param ls: iterable of whitespace-free heading strings (see get_title()).
    :return: None; results are appended to the module-level ``garde`` list.
    """
    label = ['<first>', '<second>', '<third>', '<fourth>', '<fifth>']
    mark = []  # marker class of every sub-heading since the last first-level one
    for i in ls:
        if not i:
            continue

        # Date-like lines are not headings.
        if i[1:2] == '年' and i[3:4] != ',':
            continue
        if i[:4] in year:
            continue
        if i[:2] in ('一九', '二零'):
            continue

        heading = i.split('。')[0]

        if i[:2] in first_title or i[:3] in first_title1 or i[:4] in first_title2:
            garde.append(heading + label[0])
            mark = []  # a new first-level section starts a fresh hierarchy
            continue

        # Collapse the first character into its marker class.
        first = i[:1]
        if first in interfere:
            current = '一'
        elif first.isdecimal():
            current = '壹'
        elif first in spell:
            current = 'A'
        elif first in spell1:
            current = '①'
        else:
            current = first
        mark.append(current)

        # Classes in order of first appearance: position 0 is the second
        # level, position 1 the third, and so on.
        order = list(dict.fromkeys(mark))
        if len(order) >= len(label):
            # As before, headings are left untagged once a section contains
            # more marker classes than there are sub-level labels.
            continue
        garde.append(heading + label[order.index(current) + 1])
def two_word(ls, ls1):
    """Append to *ls1* each item of *ls* it does not already contain.

    Items are taken in order, so *ls1* ends up holding the distinct values
    in order of first appearance.  *ls1* is modified in place and returned.
    """
    for candidate in ls:
        if candidate in ls1:
            continue
        ls1.append(candidate)
    return ls1
def three_word(ls, ls1):
    """Merge the distinct items of *ls* into *ls1*, keeping first-seen order.

    An item is appended only when *ls1* does not contain it yet.  *ls1* is
    modified in place and is also the return value.
    """
    for entry in ls:
        already_present = entry in ls1
        if not already_present:
            ls1.append(entry)
    return ls1
def four_word(ls, ls1):
    """Extend *ls1* in place with the items of *ls* that it lacks.

    Each item is checked against the current contents of *ls1* (including
    items added earlier in the same call), so repeats are added only once.
    Returns *ls1*.
    """
    pending = iter(ls)
    for value in pending:
        if value not in ls1:
            ls1.append(value)
    return ls1
def creat_role():
    """Fill the module-level marker tables used to recognise headings.

    Appends to ``one_characters`` ... ``five_characters`` (markers grouped by
    character length), ``first_title`` / ``first_title1`` / ``first_title2``
    (Chinese numeral followed by '、'), ``interfere``, ``number1``,
    ``number2``, ``spell`` and ``year``.  Covered markers: Chinese numerals
    一 ... 九十九, Arabic numbers 1 ... 99, ASCII letters, the parenthesised
    form of each, and the years 1951 ... 2100.

    Each parenthesised marker used to be appended twice with identical
    ASCII text.  The second copy is now the full-width form ``（…）``, so
    headings written with full-width parentheses are recognised as well.
    The round tens (二十 ... 九十), which previously had a single
    parenthesised entry, get both forms too.

    The function only appends, so calling it more than once adds duplicate
    entries.
    """
    def bracketed(marker):
        # ASCII and full-width parenthesised forms of one marker.
        return [f'({marker})', f'（{marker}）']

    basic = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
    ten = basic[-1]

    # --- Chinese numerals ---------------------------------------------------
    # 一 ... 十
    for chin in basic:
        one_characters.append(chin)
        interfere.append(chin)
        first_title.append(chin + '、')
        three_characters.extend(bracketed(chin))
    # 十一 ... 十九
    for unit in basic[:-1]:
        teen = ten + unit
        two_characters.append(teen)
        first_title1.append(teen + '、')
        four_characters.extend(bracketed(teen))
    # 二十, 三十 ... 九十
    for tens in basic[1:-1]:
        round_ten = tens + ten
        two_characters.append(round_ten)
        first_title1.append(round_ten + '、')
        four_characters.extend(bracketed(round_ten))
    # 二十一 ... 九十九 (excluding the round tens handled above)
    for tens in basic[1:-1]:
        for unit in basic[:-1]:
            compound = tens + '十' + unit
            three_characters.append(compound)
            first_title2.append(compound + '、')
            five_characters.extend(bracketed(compound))

    # --- Arabic numbers -----------------------------------------------------
    for num in range(1, 10):
        one_characters.append(str(num))
        three_characters.extend(bracketed(num))
        number1.append(num)
    for num in range(10, 100):
        two_characters.append(str(num))
        four_characters.extend(bracketed(num))
        number2.append(num)

    # --- ASCII letters: a-z first, then A-Z ---------------------------------
    for code in list(range(97, 123)) + list(range(65, 91)):
        letter = chr(code)
        one_characters.append(letter)
        three_characters.extend(bracketed(letter))
        spell.append(letter)

    # --- Years, stored as strings ---------------------------------------------
    for i in range(1951, 2101):
        year.append(str(i))
# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()
代码解析
- 初始化判断依据:我们定义了许多列表来存储不同类型的序号和干扰字符,如 `one_characters`、`two_characters` 等。这些列表用于后续判断哪些是标题。
- 读取文档内容:`read_data` 函数用于读取Word文档的内容,并将加粗的文本标记为标题候选。
- 提取标题:`get_title` 函数根据一定规则从读取的内容中提取出可能的标题。
- 分类标题:`get_second_title` 函数进一步细分标题的层级,并添加相应的标签(如 `<first>`、`<second>` 等)。
- 创建角色:`creat_role` 函数初始化标题判断依据,包括中文、数字、英文序号等。
- 去重:使用 `OrderedDict.fromkeys` 将重复添加的标题去掉,同时保留标题首次出现的顺序。
项目体验
你可以在我的Gitee仓库中找到完整的项目代码,并下载到本地进行体验。点击以下链接访问项目:
希望这个教程对你有所帮助,如果有任何问题,欢迎在评论区留言讨论。