使用python自动提取文本关键词

使用殷本纪文字内容:汤出,见野张网四面,祝曰:"自天下四方皆入吾网。"汤曰:"嘻,尽之矣!"乃去其三面,祝曰:"欲左,左。欲右,右。不用命,乃入吾网。"诸侯闻之,曰:"汤德至矣,及禽兽。"........

复制代码
import jieba
import jieba.posseg as pseg
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from collections import defaultdict
import math
import os
from pyvis.network import Network
from networkx.algorithms import community


class PathVisualizer:
    def __init__(self, stopwords_path='stopwords.txt', window_size=5, damping=0.85):
        """初始化文本路径可视化器

        Args:
            stopwords_path: 停用词文件路径
            window_size: 共现窗口大小
            damping: 随机游走阻尼系数
        """
        self.window_size = window_size
        self.damping = damping
        self.stopwords = self._load_stopwords(stopwords_path)
        jieba.initialize()
        self.pos_filter = {'n', 'v', 'a', 'x'}  # 包含专业术语的词性过滤

    def _load_stopwords(self, path):
        """加载停用词表"""
        default_stopwords = {'的', '了', '在', '是', '和', '就', '不'}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return set([line.strip() for line in f]) | default_stopwords
        except FileNotFoundError:
            print(f"警告:未找到停用词文件 {path},使用默认停用词")
            return default_stopwords

    def _text_segment(self, text):
        """专业文本预处理"""
        words = []
        for word, pos in pseg.cut(text):
            if pos[0].lower() in self.pos_filter or 'x' in pos.lower():
                if word not in self.stopwords and len(word) > 1:
                    words.append(word)
        return words

    def _build_cooccurrence_graph(self, words):
        """构建多目标共现网络"""
        graph = nx.DiGraph()
        node_weights = defaultdict(int)

        for i in range(len(words)):
            window_start = max(0, i - self.window_size)
            current_word = words[i]
            node_weights[current_word] += 1 / (i + 1)  # 位置加权

            # 创建共现关系
            for j in range(window_start, i):
                prev_word = words[j]
                weight = 1 / (i - j)  # 距离衰减因子
                if graph.has_edge(prev_word, current_word):
                    graph[prev_word][current_word]['weight'] += weight
                else:
                    graph.add_edge(prev_word, current_word, weight=weight)

        # 添加节点属性
        nx.set_node_attributes(graph, dict(node_weights), 'weight')
        return graph

    def _classify_regions(self, graph):
        """使用Louvain算法进行社区划分"""
        undi_graph = graph.to_undirected()
        communities = community.louvain_communities(undi_graph, resolution=0.8)
        regions = {}
        for i, comm in enumerate(communities):
            for node in comm:
                regions[node] = chr(65 + i)  # 按社区编号转为字母
        return regions

    def visualize_network(self, graph, output_path, regions):
        """静态可视化方法"""
        plt.figure(figsize=(24, 14))

        # 优化布局算法
        pos = nx.kamada_kawai_layout(graph, scale=2)

        # 动态调整可视化参数
        node_sizes = [800 * math.log(graph.nodes[node]['weight'] + 1) for node in graph.nodes]
        edge_widths = [0.3 + 0.5 * math.log(graph[u][v]['weight'] + 1) for u, v in graph.edges]

        # 绘制节点(使用修正后的颜色格式)
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=[self._region_color(regions[node]) for node in graph.nodes],
            edgecolors=(1, 1, 1, 0.8),  # 白色带透明度
            alpha=0.7
        )

        # 绘制边(使用RGBA元组格式)
        nx.draw_networkx_edges(
            graph, pos,
            width=edge_widths,
            edge_color=(0.5, 0.5, 0.5, 0.4),  # RGB(128,128,128)转换为归一化值
            arrowsize=15,
            alpha=0.4
        )

        # 添加智能标签
        self._draw_smart_labels(graph, pos)

        # 添加图例和注释
        self._add_legends()
        self._add_community_annotations(pos, regions)

        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

    def _region_color(self, region):
        """区域颜色映射(使用Matplotlib兼容格式)"""
        colors = {
            'A': (0.18, 0.31, 0.31, 0.8),  # #2F4F4F -> RGBA
            'B': (0.47, 0.53, 0.60, 0.7),  # #778899
            'C': (0.66, 0.66, 0.66, 0.6),  # #A9A9A9
            'D': (0.83, 0.83, 0.83, 0.5),  # #D3D3D3
            'E': (0.94, 0.97, 1.00, 0.4)  # #F0F8FF
        }
        return colors.get(region, (1, 1, 1, 1))

    def _draw_smart_labels(self, graph, pos):
        """智能标签显示策略"""
        weights = np.array(list(nx.get_node_attributes(graph, 'weight').values()))
        threshold = np.percentile(weights, 75)

        # 仅标注重要节点
        labels = {node: node for node in graph.nodes if graph.nodes[node]['weight'] > threshold}
        nx.draw_networkx_labels(
            graph, pos, labels,
            font_size=10,
            font_family='SimHei',
            alpha=0.9,
            bbox=dict(facecolor=(1, 1, 1, 0.7), edgecolor='none')  # 半透明白色背景
        )

    def _add_legends(self):
        """添加自定义图例"""
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label='核心节点',
                       markerfacecolor=(0.18, 0.31, 0.31, 0.8), markersize=15),
            plt.Line2D([0], [0], marker='o', color='w', label='次要节点',
                       markerfacecolor=(0.47, 0.53, 0.60, 0.7), markersize=12),
            plt.Line2D([0], [0], color=(0.5, 0.5, 0.5, 0.4), lw=3, label='关联强度')
        ]
        plt.legend(handles=legend_elements, loc='upper left', fontsize=12, framealpha=0.9)

    def _add_community_annotations(self, pos, regions):
        """添加社区标注(使用正确颜色格式)"""
        region_centers = defaultdict(list)
        for node, reg in regions.items():
            region_centers[reg].append(pos[node])

        for reg, coords in region_centers.items():
            x, y = zip(*coords)
            center = (np.mean(x), np.mean(y))
            plt.annotate(
                f'社区 {reg}',
                xy=center,
                xytext=(center[0], center[1] + 0.15),
                ha='center',
                fontsize=14,
                weight='bold',
                color=(0.80, 0.40, 0.40, 0.9),  # #FF6B6B -> RGBA
                arrowprops=dict(arrowstyle='->', color=(0.80, 0.40, 0.40, 0.7))
            )

    def generate_interactive_network(self, graph, regions, output_html):
        """生成交互式可视化(保持原始颜色格式)"""
        net = Network(height='800px', width='100%', notebook=False, bgcolor='#ffffff')

        # 颜色转换函数
        def to_hex(rgba):
            return "#{:02x}{:02x}{:02x}".format(
                int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255)
            )

        # 添加节点
        for node in graph.nodes:
            net.add_node(
                node,
                label=node,
                color=to_hex(self._region_color(regions[node])),
                size=15 * math.log(graph.nodes[node]['weight'] + 1),
                title=f"关键词: {node}<br>权重: {graph.nodes[node]['weight']:.2f}"
            )

        # 添加边
        for u, v in graph.edges:
            net.add_edge(
                u, v,
                width=0.3 * math.log(graph[u][v]['weight'] + 1),
                color='rgba(128,128,128,0.3)',  # PyVis支持CSS颜色格式
                title=f"关联强度: {graph[u][v]['weight']:.2f}"
            )

        # 物理引擎配置
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=150,
            damping=0.9
        )

        # 添加控制面板
        net.show_buttons(filter_=['physics', 'nodes', 'edges'])
        net.save_graph(output_html)

    def process_text(self, file_path):
        """完整处理流程"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件路径不存在:{file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 文本处理
        words = self._text_segment(text)
        if len(words) < 10:
            raise ValueError("文本内容过短,需至少包含10个有效词汇")

        # 构建网络
        graph = self._build_cooccurrence_graph(words)
        regions = self._classify_regions(graph)

        return graph, regions


# 使用示例
if __name__ == "__main__":
    # 输入输出配置
    input_file = r"D:\daku\1980.txt"
    static_output = r"D:\daku\static_vis.png"
    interactive_output = r"D:\daku\interactive_vis.html"

    # 初始化处理器
    processor = PathVisualizer()

    try:
        # 处理文本
        graph, regions = processor.process_text(input_file)

        # 生成两种可视化
        processor.visualize_network(graph, static_output, regions)
        processor.generate_interactive_network(graph, regions, interactive_output)

        print(f"静态可视化已保存至:{static_output}")
        print(f"交互式可视化已保存至:{interactive_output}")

    except Exception as e:
        print(f"处理过程中发生错误:{str(e)}")
相关推荐
fqbqrr2 小时前
2606C++,C++构的多态
开发语言·c++
biter down3 小时前
从 0 到 1 搭建 Python 接口自动化测试框架(博客系统实战)
开发语言·python
肖永威4 小时前
Python多业务并行计算框架插件化演进:从硬编码到动态注册
python·插件化·并行计算·动态注册
yz_aiks4 小时前
Linux Jar包配置Systemd自启动实战:从排查到配置全流程
linux·python·jar·自启动·systemd
threelab5 小时前
Three.js 物理模拟着色器 | 三维可视化 / AI 提示词
开发语言·前端·javascript·人工智能·3d·着色器
武器大师725 小时前
lv_binding_js 代码解读
开发语言·javascript·ecmascript
不知名的老吴5 小时前
线程的生命周期之线程“插队“
java·开发语言·python
kaikaile19955 小时前
数字全息图处理系统(C# 实现)
开发语言·c#
xsc6996756 小时前
从零搭建大模型与智能体平台 - 完整技术详解
python