使用python自动提取文本关键词

使用殷本纪文字内容:汤出,见野张网四面,祝曰:"自天下四方皆入吾网。"汤曰:"嘻,尽之矣!"乃去其三面,祝曰:"欲左,左。欲右,右。不用命,乃入吾网。"诸侯闻之,曰:"汤德至矣,及禽兽。"........

复制代码
import jieba
import jieba.posseg as pseg
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from collections import defaultdict
import math
import os
from pyvis.network import Network
from networkx.algorithms import community


class PathVisualizer:
    def __init__(self, stopwords_path='stopwords.txt', window_size=5, damping=0.85):
        """初始化文本路径可视化器

        Args:
            stopwords_path: 停用词文件路径
            window_size: 共现窗口大小
            damping: 随机游走阻尼系数
        """
        self.window_size = window_size
        self.damping = damping
        self.stopwords = self._load_stopwords(stopwords_path)
        jieba.initialize()
        self.pos_filter = {'n', 'v', 'a', 'x'}  # 包含专业术语的词性过滤

    def _load_stopwords(self, path):
        """加载停用词表"""
        default_stopwords = {'的', '了', '在', '是', '和', '就', '不'}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return set([line.strip() for line in f]) | default_stopwords
        except FileNotFoundError:
            print(f"警告:未找到停用词文件 {path},使用默认停用词")
            return default_stopwords

    def _text_segment(self, text):
        """专业文本预处理"""
        words = []
        for word, pos in pseg.cut(text):
            if pos[0].lower() in self.pos_filter or 'x' in pos.lower():
                if word not in self.stopwords and len(word) > 1:
                    words.append(word)
        return words

    def _build_cooccurrence_graph(self, words):
        """构建多目标共现网络"""
        graph = nx.DiGraph()
        node_weights = defaultdict(int)

        for i in range(len(words)):
            window_start = max(0, i - self.window_size)
            current_word = words[i]
            node_weights[current_word] += 1 / (i + 1)  # 位置加权

            # 创建共现关系
            for j in range(window_start, i):
                prev_word = words[j]
                weight = 1 / (i - j)  # 距离衰减因子
                if graph.has_edge(prev_word, current_word):
                    graph[prev_word][current_word]['weight'] += weight
                else:
                    graph.add_edge(prev_word, current_word, weight=weight)

        # 添加节点属性
        nx.set_node_attributes(graph, dict(node_weights), 'weight')
        return graph

    def _classify_regions(self, graph):
        """使用Louvain算法进行社区划分"""
        undi_graph = graph.to_undirected()
        communities = community.louvain_communities(undi_graph, resolution=0.8)
        regions = {}
        for i, comm in enumerate(communities):
            for node in comm:
                regions[node] = chr(65 + i)  # 按社区编号转为字母
        return regions

    def visualize_network(self, graph, output_path, regions):
        """静态可视化方法"""
        plt.figure(figsize=(24, 14))

        # 优化布局算法
        pos = nx.kamada_kawai_layout(graph, scale=2)

        # 动态调整可视化参数
        node_sizes = [800 * math.log(graph.nodes[node]['weight'] + 1) for node in graph.nodes]
        edge_widths = [0.3 + 0.5 * math.log(graph[u][v]['weight'] + 1) for u, v in graph.edges]

        # 绘制节点(使用修正后的颜色格式)
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=[self._region_color(regions[node]) for node in graph.nodes],
            edgecolors=(1, 1, 1, 0.8),  # 白色带透明度
            alpha=0.7
        )

        # 绘制边(使用RGBA元组格式)
        nx.draw_networkx_edges(
            graph, pos,
            width=edge_widths,
            edge_color=(0.5, 0.5, 0.5, 0.4),  # RGB(128,128,128)转换为归一化值
            arrowsize=15,
            alpha=0.4
        )

        # 添加智能标签
        self._draw_smart_labels(graph, pos)

        # 添加图例和注释
        self._add_legends()
        self._add_community_annotations(pos, regions)

        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

    def _region_color(self, region):
        """区域颜色映射(使用Matplotlib兼容格式)"""
        colors = {
            'A': (0.18, 0.31, 0.31, 0.8),  # #2F4F4F -> RGBA
            'B': (0.47, 0.53, 0.60, 0.7),  # #778899
            'C': (0.66, 0.66, 0.66, 0.6),  # #A9A9A9
            'D': (0.83, 0.83, 0.83, 0.5),  # #D3D3D3
            'E': (0.94, 0.97, 1.00, 0.4)  # #F0F8FF
        }
        return colors.get(region, (1, 1, 1, 1))

    def _draw_smart_labels(self, graph, pos):
        """智能标签显示策略"""
        weights = np.array(list(nx.get_node_attributes(graph, 'weight').values()))
        threshold = np.percentile(weights, 75)

        # 仅标注重要节点
        labels = {node: node for node in graph.nodes if graph.nodes[node]['weight'] > threshold}
        nx.draw_networkx_labels(
            graph, pos, labels,
            font_size=10,
            font_family='SimHei',
            alpha=0.9,
            bbox=dict(facecolor=(1, 1, 1, 0.7), edgecolor='none')  # 半透明白色背景
        )

    def _add_legends(self):
        """添加自定义图例"""
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label='核心节点',
                       markerfacecolor=(0.18, 0.31, 0.31, 0.8), markersize=15),
            plt.Line2D([0], [0], marker='o', color='w', label='次要节点',
                       markerfacecolor=(0.47, 0.53, 0.60, 0.7), markersize=12),
            plt.Line2D([0], [0], color=(0.5, 0.5, 0.5, 0.4), lw=3, label='关联强度')
        ]
        plt.legend(handles=legend_elements, loc='upper left', fontsize=12, framealpha=0.9)

    def _add_community_annotations(self, pos, regions):
        """添加社区标注(使用正确颜色格式)"""
        region_centers = defaultdict(list)
        for node, reg in regions.items():
            region_centers[reg].append(pos[node])

        for reg, coords in region_centers.items():
            x, y = zip(*coords)
            center = (np.mean(x), np.mean(y))
            plt.annotate(
                f'社区 {reg}',
                xy=center,
                xytext=(center[0], center[1] + 0.15),
                ha='center',
                fontsize=14,
                weight='bold',
                color=(0.80, 0.40, 0.40, 0.9),  # #FF6B6B -> RGBA
                arrowprops=dict(arrowstyle='->', color=(0.80, 0.40, 0.40, 0.7))
            )

    def generate_interactive_network(self, graph, regions, output_html):
        """生成交互式可视化(保持原始颜色格式)"""
        net = Network(height='800px', width='100%', notebook=False, bgcolor='#ffffff')

        # 颜色转换函数
        def to_hex(rgba):
            return "#{:02x}{:02x}{:02x}".format(
                int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255)
            )

        # 添加节点
        for node in graph.nodes:
            net.add_node(
                node,
                label=node,
                color=to_hex(self._region_color(regions[node])),
                size=15 * math.log(graph.nodes[node]['weight'] + 1),
                title=f"关键词: {node}<br>权重: {graph.nodes[node]['weight']:.2f}"
            )

        # 添加边
        for u, v in graph.edges:
            net.add_edge(
                u, v,
                width=0.3 * math.log(graph[u][v]['weight'] + 1),
                color='rgba(128,128,128,0.3)',  # PyVis支持CSS颜色格式
                title=f"关联强度: {graph[u][v]['weight']:.2f}"
            )

        # 物理引擎配置
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=150,
            damping=0.9
        )

        # 添加控制面板
        net.show_buttons(filter_=['physics', 'nodes', 'edges'])
        net.save_graph(output_html)

    def process_text(self, file_path):
        """完整处理流程"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件路径不存在:{file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 文本处理
        words = self._text_segment(text)
        if len(words) < 10:
            raise ValueError("文本内容过短,需至少包含10个有效词汇")

        # 构建网络
        graph = self._build_cooccurrence_graph(words)
        regions = self._classify_regions(graph)

        return graph, regions


# 使用示例
if __name__ == "__main__":
    # 输入输出配置
    input_file = r"D:\daku\1980.txt"
    static_output = r"D:\daku\static_vis.png"
    interactive_output = r"D:\daku\interactive_vis.html"

    # 初始化处理器
    processor = PathVisualizer()

    try:
        # 处理文本
        graph, regions = processor.process_text(input_file)

        # 生成两种可视化
        processor.visualize_network(graph, static_output, regions)
        processor.generate_interactive_network(graph, regions, interactive_output)

        print(f"静态可视化已保存至:{static_output}")
        print(f"交互式可视化已保存至:{interactive_output}")

    except Exception as e:
        print(f"处理过程中发生错误:{str(e)}")
相关推荐
毕设源码-邱学长4 小时前
【开题答辩全过程】以 基于Java的学校住宿管理系统的设计与实现为例,包含答辩的问题和答案
java·开发语言
rookieﻬ°5 小时前
PHP框架漏洞
开发语言·php
猿界零零七5 小时前
pip install mxnet 报错解决方案
python·pip·mxnet
炸膛坦客5 小时前
单片机/C/C++八股:(二十)指针常量和常量指针
c语言·开发语言·c++
兑生6 小时前
【灵神题单·贪心】1481. 不同整数的最少数目 | 频率排序贪心 | Java
java·开发语言
炸膛坦客7 小时前
单片机/C/C++八股:(十九)栈和堆的区别?
c语言·开发语言·c++
零雲7 小时前
java面试:了解抽象类与接口么?讲一讲它们的区别
java·开发语言·面试
不只会拍照的程序猿7 小时前
《嵌入式AI筑基笔记02:Python数据类型01,从C的“硬核”到Python的“包容”》
人工智能·笔记·python
Jay_Franklin7 小时前
Quarto与Python集成使用
开发语言·python·markdown
2401_831824968 小时前
代码性能剖析工具
开发语言·c++·算法