一个使用 PyQt 的 Word 文档查重工具

使用场景
代码
使用截图
打包好的软件下载链接
结尾
使用场景

有时我们在借鉴一篇文档之后，不希望与原文有太多重复，这时可以使用这个工具对两个 Word 文档进行对比。
代码

python 复制代码
import sys
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QVBoxLayout, QWidget, QLabel, QFileDialog
from docx import Document
import re, datetime


class WordComparerApp(QMainWindow):
    """Main window of a small tool that reports text shared by two .docx files.

    The user picks two Word documents with the two file buttons. Pressing the
    compare button splits every paragraph of both documents into short
    segments and prints, to standard output, each pair of paragraphs that
    share enough segment text.
    """

    def __init__(self):
        super().__init__()

        # Paths of the documents to compare; None until the user picks one.
        self.file1 = None
        self.file2 = None

        self.initUI()

    def initUI(self):
        """Build the window: two label/button pairs for the files and a compare button."""
        self.setWindowTitle('Word 文档比较器')
        self.setGeometry(100, 100, 400, 200)

        self.centralWidget = QWidget(self)
        self.setCentralWidget(self.centralWidget)

        self.layout = QVBoxLayout()

        self.file1_label = QLabel('选择文件1:')
        self.layout.addWidget(self.file1_label)

        self.file1_button = QPushButton('选择文件1')
        self.file1_button.clicked.connect(self.openFile1)
        self.layout.addWidget(self.file1_button)

        self.file2_label = QLabel('选择文件2:')
        self.layout.addWidget(self.file2_label)

        self.file2_button = QPushButton('选择文件2')
        self.file2_button.clicked.connect(self.openFile2)
        self.layout.addWidget(self.file2_button)

        self.compare_button = QPushButton('开始比较')
        self.compare_button.clicked.connect(self.compareFiles)
        self.layout.addWidget(self.compare_button)

        self.centralWidget.setLayout(self.layout)

    def _chooseDocx(self, title):
        """Show an open-file dialog filtered to *.docx and return the selected path.

        The returned value is falsy when the user did not select a file.
        """
        options = QFileDialog.Options()
        path, _ = QFileDialog.getOpenFileName(self, title, "", "Word Files (*.docx)", options=options)
        return path

    def openFile1(self):
        """Let the user pick the first document; remember it and show it in the label."""
        file1 = self._chooseDocx("选择文件1")
        if file1:
            self.file1_label.setText(f'选择文件1: {file1}')
            self.file1 = file1

    def openFile2(self):
        """Let the user pick the second document; remember it and show it in the label."""
        file2 = self._chooseDocx("选择文件2")
        if file2:
            self.file2_label.setText(f'选择文件2: {file2}')
            self.file2 = file2

    def compareFiles(self):
        """Load both selected documents and compare every paragraph pair.

        Progress, matches and timing are printed to standard output. If either
        file has not been selected, or a file cannot be loaded, a message is
        printed and no comparison is run.
        """
        if not (self.file1 and self.file2):
            # Previously this case was a silent no-op; tell the user why.
            print('请先选择两个待比较的 Word 文件')
            return

        try:
            doc1 = self.readDocx(self.file1)
            doc2 = self.readDocx(self.file2)
        except Exception as exc:
            # This method is a Qt slot, i.e. the outermost Python frame for the
            # click. Report the failure instead of letting it escape the slot.
            print('文件加载失败: ', exc)
            return

        print('开始比对...'.center(80, '*'))
        t1 = datetime.datetime.now()
        total = len(doc1)
        for i in range(total):
            if i % 100 == 0:
                print('处理进行中，已处理段落 {0:>4d} (总数 {1:0>4d} ） '.format(i, total))
            for j in range(len(doc2)):
                self.compareParagraph(doc1, i, doc2, j)
        t2 = datetime.datetime.now()
        print('\n比对完成，总用时: ', t2 - t1)

    def getText(self, wordname):
        """Return the text of every paragraph of the document at ``wordname``."""
        return [para.text for para in Document(wordname).paragraphs]

    def msplit(self, s, separators=r',|\.|\?|，|。|？|！'):
        """Split ``s`` on the regular expression ``separators``.

        The default pattern matches ASCII comma, full stop and question mark
        plus the full-width comma, full stop, question mark and exclamation
        mark. It is a raw string so the escaped characters reach ``re``
        unchanged without invalid-escape warnings.
        """
        return re.split(separators, s)

    def readDocx(self, docfile):
        """Load ``docfile`` and return its paragraphs as lists of segments.

        Each paragraph is split with ``msplit``; spaces are removed from every
        fragment and fragments of two characters or fewer are dropped.
        Paragraphs left with no segment are omitted. Load time and summary
        counts are printed.
        """
        print('*' * 80)
        print('文件', docfile, '加载中......')
        t1 = datetime.datetime.now()
        paras = self.getText(docfile)
        segs = []
        for p in paras:
            temp = []
            for s in self.msplit(p):
                # Remove spaces before the length check so that fragments made
                # mostly of spaces are not kept as empty or tiny segments.
                s = s.replace(' ', "")
                if len(s) > 2:
                    temp.append(s)
            if temp:
                segs.append(temp)
        t2 = datetime.datetime.now()
        print('加载完成，用时: ', t2 - t1)
        self.showInfo(segs, docfile)
        return segs

    def showInfo(self, doc, filename='filename'):
        """Print paragraph, segment and character counts for ``doc``.

        ``doc`` is a list of paragraphs, each a list of segment strings.
        ``filename`` is accepted for compatibility but is not used.
        """
        segs = sum(len(p) for p in doc)
        chars = sum(len(s) for p in doc for s in p)
        print('段落数: {0:>8d} 个。'.format(len(doc)))
        print('短句数: {0:>8d} 句。'.format(segs))
        print('字符数: {0:>8d} 个。'.format(chars))

    def compareParagraph(self, doc1, i, doc2, j, min_segment=5):
        """Compare paragraph ``i`` of ``doc1`` with paragraph ``j`` of ``doc2``.

        A segment counts as shared when one segment is a substring of the
        other; the shorter (contained) segment is recorded. Segments shorter
        than ``min_segment`` are ignored, and paragraphs whose segments total
        fewer than 10 characters are skipped entirely.

        When more than 10 shared characters are found and they exceed 10% of
        the shorter paragraph's length, a report is printed.

        Returns the list of shared segments (empty for skipped paragraphs).
        """
        p1 = doc1[i]
        p2 = doc2[j]
        len1 = sum(len(s) for s in p1)
        len2 = sum(len(s) for s in p2)
        if len1 < 10 or len2 < 10:
            return []

        lst = []
        for s1 in p1:
            if len(s1) < min_segment:
                continue
            for s2 in p2:
                if len(s2) < min_segment:
                    continue
                if s2 in s1:
                    lst.append(s2)
                elif s1 in s2:
                    lst.append(s1)

        count = sum(len(s) for s in lst)
        # Both lengths are at least 10 here, so the divisor is never zero.
        ratio = float(count) / min(len1, len2)
        if count > 10 and ratio > 0.1:
            print(' 发现相同内容 '.center(80, '*'))
            print('文件1第{0:0>4d}段内容：{1}'.format(i + 1, p1))
            print('文件2第{0:0>4d}段内容：{1}'.format(j + 1, p2))
            print('相同内容：', lst)
            print('相同字符比：{1:.2f}%\n相同字符数： {0}\n'.format(count, ratio * 100))
        return lst


def main():
    """Create the Qt application, show the comparer window and run the event loop.

    The process exits with the value returned by the event loop.
    """
    application = QApplication(sys.argv)
    window = WordComparerApp()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
使用截图

打包好的软件下载链接

文档查重器
结尾

如果觉得文章对你有用请点赞、关注 ->> 你的点赞对我太有用了
群内交流更多技术
130856474 <-- 在这里