使用python自动提取文本关键词

使用殷本纪文字内容:汤出,见野张网四面,祝曰:"自天下四方皆入吾网。"汤曰:"嘻,尽之矣!"乃去其三面,祝曰:"欲左,左。欲右,右。不用命,乃入吾网。"诸侯闻之,曰:"汤德至矣,及禽兽。"........

复制代码
import jieba
import jieba.posseg as pseg
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from collections import defaultdict
import math
import os
from pyvis.network import Network
from networkx.algorithms import community


class PathVisualizer:
    def __init__(self, stopwords_path='stopwords.txt', window_size=5, damping=0.85):
        """初始化文本路径可视化器

        Args:
            stopwords_path: 停用词文件路径
            window_size: 共现窗口大小
            damping: 随机游走阻尼系数
        """
        self.window_size = window_size
        self.damping = damping
        self.stopwords = self._load_stopwords(stopwords_path)
        jieba.initialize()
        self.pos_filter = {'n', 'v', 'a', 'x'}  # 包含专业术语的词性过滤

    def _load_stopwords(self, path):
        """加载停用词表"""
        default_stopwords = {'的', '了', '在', '是', '和', '就', '不'}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return set([line.strip() for line in f]) | default_stopwords
        except FileNotFoundError:
            print(f"警告:未找到停用词文件 {path},使用默认停用词")
            return default_stopwords

    def _text_segment(self, text):
        """专业文本预处理"""
        words = []
        for word, pos in pseg.cut(text):
            if pos[0].lower() in self.pos_filter or 'x' in pos.lower():
                if word not in self.stopwords and len(word) > 1:
                    words.append(word)
        return words

    def _build_cooccurrence_graph(self, words):
        """构建多目标共现网络"""
        graph = nx.DiGraph()
        node_weights = defaultdict(int)

        for i in range(len(words)):
            window_start = max(0, i - self.window_size)
            current_word = words[i]
            node_weights[current_word] += 1 / (i + 1)  # 位置加权

            # 创建共现关系
            for j in range(window_start, i):
                prev_word = words[j]
                weight = 1 / (i - j)  # 距离衰减因子
                if graph.has_edge(prev_word, current_word):
                    graph[prev_word][current_word]['weight'] += weight
                else:
                    graph.add_edge(prev_word, current_word, weight=weight)

        # 添加节点属性
        nx.set_node_attributes(graph, dict(node_weights), 'weight')
        return graph

    def _classify_regions(self, graph):
        """使用Louvain算法进行社区划分"""
        undi_graph = graph.to_undirected()
        communities = community.louvain_communities(undi_graph, resolution=0.8)
        regions = {}
        for i, comm in enumerate(communities):
            for node in comm:
                regions[node] = chr(65 + i)  # 按社区编号转为字母
        return regions

    def visualize_network(self, graph, output_path, regions):
        """静态可视化方法"""
        plt.figure(figsize=(24, 14))

        # 优化布局算法
        pos = nx.kamada_kawai_layout(graph, scale=2)

        # 动态调整可视化参数
        node_sizes = [800 * math.log(graph.nodes[node]['weight'] + 1) for node in graph.nodes]
        edge_widths = [0.3 + 0.5 * math.log(graph[u][v]['weight'] + 1) for u, v in graph.edges]

        # 绘制节点(使用修正后的颜色格式)
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=[self._region_color(regions[node]) for node in graph.nodes],
            edgecolors=(1, 1, 1, 0.8),  # 白色带透明度
            alpha=0.7
        )

        # 绘制边(使用RGBA元组格式)
        nx.draw_networkx_edges(
            graph, pos,
            width=edge_widths,
            edge_color=(0.5, 0.5, 0.5, 0.4),  # RGB(128,128,128)转换为归一化值
            arrowsize=15,
            alpha=0.4
        )

        # 添加智能标签
        self._draw_smart_labels(graph, pos)

        # 添加图例和注释
        self._add_legends()
        self._add_community_annotations(pos, regions)

        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

    def _region_color(self, region):
        """区域颜色映射(使用Matplotlib兼容格式)"""
        colors = {
            'A': (0.18, 0.31, 0.31, 0.8),  # #2F4F4F -> RGBA
            'B': (0.47, 0.53, 0.60, 0.7),  # #778899
            'C': (0.66, 0.66, 0.66, 0.6),  # #A9A9A9
            'D': (0.83, 0.83, 0.83, 0.5),  # #D3D3D3
            'E': (0.94, 0.97, 1.00, 0.4)  # #F0F8FF
        }
        return colors.get(region, (1, 1, 1, 1))

    def _draw_smart_labels(self, graph, pos):
        """智能标签显示策略"""
        weights = np.array(list(nx.get_node_attributes(graph, 'weight').values()))
        threshold = np.percentile(weights, 75)

        # 仅标注重要节点
        labels = {node: node for node in graph.nodes if graph.nodes[node]['weight'] > threshold}
        nx.draw_networkx_labels(
            graph, pos, labels,
            font_size=10,
            font_family='SimHei',
            alpha=0.9,
            bbox=dict(facecolor=(1, 1, 1, 0.7), edgecolor='none')  # 半透明白色背景
        )

    def _add_legends(self):
        """添加自定义图例"""
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label='核心节点',
                       markerfacecolor=(0.18, 0.31, 0.31, 0.8), markersize=15),
            plt.Line2D([0], [0], marker='o', color='w', label='次要节点',
                       markerfacecolor=(0.47, 0.53, 0.60, 0.7), markersize=12),
            plt.Line2D([0], [0], color=(0.5, 0.5, 0.5, 0.4), lw=3, label='关联强度')
        ]
        plt.legend(handles=legend_elements, loc='upper left', fontsize=12, framealpha=0.9)

    def _add_community_annotations(self, pos, regions):
        """添加社区标注(使用正确颜色格式)"""
        region_centers = defaultdict(list)
        for node, reg in regions.items():
            region_centers[reg].append(pos[node])

        for reg, coords in region_centers.items():
            x, y = zip(*coords)
            center = (np.mean(x), np.mean(y))
            plt.annotate(
                f'社区 {reg}',
                xy=center,
                xytext=(center[0], center[1] + 0.15),
                ha='center',
                fontsize=14,
                weight='bold',
                color=(0.80, 0.40, 0.40, 0.9),  # #FF6B6B -> RGBA
                arrowprops=dict(arrowstyle='->', color=(0.80, 0.40, 0.40, 0.7))
            )

    def generate_interactive_network(self, graph, regions, output_html):
        """生成交互式可视化(保持原始颜色格式)"""
        net = Network(height='800px', width='100%', notebook=False, bgcolor='#ffffff')

        # 颜色转换函数
        def to_hex(rgba):
            return "#{:02x}{:02x}{:02x}".format(
                int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255)
            )

        # 添加节点
        for node in graph.nodes:
            net.add_node(
                node,
                label=node,
                color=to_hex(self._region_color(regions[node])),
                size=15 * math.log(graph.nodes[node]['weight'] + 1),
                title=f"关键词: {node}<br>权重: {graph.nodes[node]['weight']:.2f}"
            )

        # 添加边
        for u, v in graph.edges:
            net.add_edge(
                u, v,
                width=0.3 * math.log(graph[u][v]['weight'] + 1),
                color='rgba(128,128,128,0.3)',  # PyVis支持CSS颜色格式
                title=f"关联强度: {graph[u][v]['weight']:.2f}"
            )

        # 物理引擎配置
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=150,
            damping=0.9
        )

        # 添加控制面板
        net.show_buttons(filter_=['physics', 'nodes', 'edges'])
        net.save_graph(output_html)

    def process_text(self, file_path):
        """完整处理流程"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件路径不存在:{file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 文本处理
        words = self._text_segment(text)
        if len(words) < 10:
            raise ValueError("文本内容过短,需至少包含10个有效词汇")

        # 构建网络
        graph = self._build_cooccurrence_graph(words)
        regions = self._classify_regions(graph)

        return graph, regions


# 使用示例
if __name__ == "__main__":
    # 输入输出配置
    input_file = r"D:\daku\1980.txt"
    static_output = r"D:\daku\static_vis.png"
    interactive_output = r"D:\daku\interactive_vis.html"

    # 初始化处理器
    processor = PathVisualizer()

    try:
        # 处理文本
        graph, regions = processor.process_text(input_file)

        # 生成两种可视化
        processor.visualize_network(graph, static_output, regions)
        processor.generate_interactive_network(graph, regions, interactive_output)

        print(f"静态可视化已保存至:{static_output}")
        print(f"交互式可视化已保存至:{interactive_output}")

    except Exception as e:
        print(f"处理过程中发生错误:{str(e)}")
相关推荐
X56611 小时前
如何在 Laravel 中正确保存嵌套动态表单数据(主服务与子服务)
jvm·数据库·python
FQNmxDG4S1 小时前
Java多线程编程:Thread与Runnable的并发控制
java·开发语言
ZhengEnCi1 小时前
03ab-PyTorch安装教程 📚
python
前端老石人2 小时前
HTML 字符引用完全指南
开发语言·前端·html
matlab_xiaowang2 小时前
Redux 入门:JavaScript 可预测状态管理库
开发语言·javascript·其他·ecmascript
狐狐生风2 小时前
LangChain 向量存储:Chroma、FAISS
人工智能·python·学习·langchain·faiss·agentai
虹科网络安全2 小时前
艾体宝干货|数据复制详解:类型、原理与适用场景
java·开发语言·数据库
狐狐生风2 小时前
LangChain RAG 基础
人工智能·python·学习·langchain·rag·agentai
axng pmje2 小时前
Java语法进阶
java·开发语言·jvm
老前端的功夫3 小时前
【Java从入门到入土】28:Stream API:告别for循环的新时代
java·开发语言·python