小说文本分析工具:基于streamlit实现的文本分析

小说文本分析工具:基于streamlit实现的文本分析

本工具使用 Python 识别并分割小说文本中的各个章节,结合 jieba 分词与停用词库抽取每章的关键词,以词云形式展示所选章节的关键词,并根据各章节关键词的相似度绘制章节之间的带权关系网络。

复制代码
import re
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
from collections import Counter
import chardet
import numpy as np
import networkx as nx
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# =====================
# Global configuration
# =====================
# Absolute path of the stop-word list (one word per line); read by main().
DEFAULT_STOPWORDS_PATH = r"D:\daku\小说图谱\stopwords.txt"
MAX_FILE_SIZE = 200  # Upload size limit in MB, checked in main()
# Colours shared by the injected page CSS, the word cloud and the graph labels.
BACKGROUND_COLOR = "#0E1117"
TEXT_COLOR = "#FFFFFF"

# =====================
# Initialization
# =====================
# Page-level Streamlit settings (title, icon, wide layout, sidebar open).
st.set_page_config(
    page_title="小说文本分析工具",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Initialize jieba at import time — presumably so the dictionary is loaded
# before the first segmentation call rather than during it.
jieba.initialize()


# =====================
# 核心功能模块
# =====================
@st.cache_data
def split_chapters(content):
    """Split novel text into a list of ``(title, chapter_text)`` tuples.

    Three heading styles are recognized: ``第N章 ...`` (Chinese or Arabic
    numerals), ``【...】`` and ``<h1>...</h1>``. Each chapter's text runs from
    the start of its heading up to the start of the next heading (or the end
    of the text) and therefore includes the heading itself.

    Args:
        content: Full text of the novel.

    Returns:
        Chapters in document order. Non-blank text before the first heading
        is returned as a chapter titled "前言". Chapters whose stripped text
        is 10 characters or shorter are dropped. When nothing usable is
        found, the whole stripped text is returned as one "全文" chapter.
    """
    patterns = [
        # ``[^\S\n]*`` is "whitespace except newline": a heading that sits
        # alone on its line must not pull the next line into its title.
        r'(第[零一二三四五六七八九十百千万\d]+章[^\S\n]*[^\n]*)',
        r'(【.*?】)\s*',
        r'(<h1>.*?</h1>)\s*'
    ]

    # Key headings by start offset so each position yields one heading.
    headings = {}
    for pattern in patterns:
        for match in re.finditer(pattern, content, flags=re.MULTILINE):
            headings[match.start()] = match.group(0).strip()
    matches = sorted(headings.items())

    chapters = []

    # Text before the first heading becomes the preface, unless it is blank.
    if matches:
        preface = content[:matches[0][0]].strip()
        if preface:
            chapters.append(("前言", preface))

    # Each chapter ends where the next heading starts; the last one ends at
    # the end of the text.
    end_positions = [pos for pos, _ in matches[1:]] + [len(content)]
    for (start_pos, title), end_pos in zip(matches, end_positions):
        chapter_content = content[start_pos:end_pos].strip()

        # Skip headings with (almost) no body: require more than 10 chars.
        if len(chapter_content) > 10:
            chapters.append((title, chapter_content))

    # No usable chapter at all: treat the whole text as a single chapter.
    if not chapters:
        chapters = [("全文", content.strip())]

    return chapters


@st.cache_data
def calculate_jaccard_similarity(keyword_data, top_n=10):
    """Build a symmetric Jaccard similarity matrix between chapters.

    Args:
        keyword_data: Sequence of ``(title, keywords)`` pairs, where
            ``keywords`` is a ranked list of ``(word, count)`` tuples.
        top_n: Number of leading keywords of each chapter to compare.

    Returns:
        A square ``numpy`` array whose ``[i][j]`` entry is
        ``|A ∩ B| / |A ∪ B|`` for the top-keyword sets of chapters ``i`` and
        ``j``. Only off-diagonal entries are filled in; the diagonal stays 0,
        as do pairs whose keyword sets are both empty.
    """
    top_sets = [
        {word for word, _ in ranked[:top_n]}
        for _, ranked in keyword_data
    ]
    chapter_count = len(top_sets)
    similarity_matrix = np.zeros((chapter_count, chapter_count))

    for i, left in enumerate(top_sets):
        for j in range(i + 1, chapter_count):
            right = top_sets[j]
            union_size = len(left | right)
            if union_size != 0:
                score = len(left & right) / union_size
            else:
                score = 0
            # Fill both triangles so the matrix is symmetric.
            similarity_matrix[i][j] = score
            similarity_matrix[j][i] = score

    return similarity_matrix


# =====================
# 可视化模块
# =====================
def generate_dark_wordcloud(counter):
    """Render a word cloud on the dark theme background.

    Args:
        counter: Mapping of word -> frequency passed to
            ``WordCloud.generate_from_frequencies``.

    Returns:
        The matplotlib figure that shows the rendered cloud with axes hidden.
    """
    cloud_options = dict(
        font_path="simhei.ttf",
        width=800,
        height=400,
        background_color=BACKGROUND_COLOR,
        colormap='viridis',
        max_words=50,
        contour_color=TEXT_COLOR,
    )
    cloud = WordCloud(**cloud_options).generate_from_frequencies(counter)

    figure, axes = plt.subplots(figsize=(10, 6))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis("off")
    return figure


def draw_network_graph(similarity_matrix, labels, threshold=0.3):
    """Draw the chapter relationship network on a new pyplot figure.

    Every chapter becomes one node; two chapters are connected when their
    similarity is strictly greater than ``threshold``. Edge width is
    proportional to the similarity, and every drawn edge is annotated with
    its weight.

    Args:
        similarity_matrix: Square matrix indexed ``[i][j]`` with the
            similarity between chapter ``i`` and chapter ``j``.
        labels: Chapter titles, in the same order as the matrix rows.
        threshold: Minimum similarity (exclusive) for an edge to be drawn.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure holds the
        drawing.
    """
    G = nx.Graph()

    # Nodes are keyed by chapter index, not by the shortened title, so that
    # chapters whose titles share the same first 12 characters stay separate.
    # The shortened title is only used as the text drawn on the node.
    display_names = {i: label[:12] for i, label in enumerate(labels)}
    G.add_nodes_from(display_names)
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            weight = similarity_matrix[i][j]
            if weight > threshold:
                G.add_edge(i, j, weight=weight)

    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=0.8)

    # Nodes
    nx.draw_networkx_nodes(
        G, pos,
        node_size=1200,
        node_color="#4B8BBE",
        alpha=0.9
    )

    # Edges, with width scaled by similarity
    edges = list(G.edges(data=True))
    nx.draw_networkx_edges(
        G, pos,
        edgelist=[(u, v) for u, v, _ in edges],
        width=[d['weight'] * 3 for _, _, d in edges],
        edge_color="#7F7F7F",
        alpha=0.6
    )

    # Node labels (shortened chapter titles)
    nx.draw_networkx_labels(
        G, pos,
        labels=display_names,
        font_size=10,
        font_family='SimHei',
        font_color=TEXT_COLOR
    )

    # Edge weight labels. Every edge already exceeds ``threshold``, so all of
    # them are labelled instead of re-filtering against a fixed 0.3.
    edge_labels = {(u, v): f"{d['weight']:.2f}" for u, v, d in edges}
    nx.draw_networkx_edge_labels(
        G, pos,
        edge_labels=edge_labels,
        font_color="#FF4B4B"
    )

    plt.axis('off')
    return plt


# =====================
# 主界面实现
# =====================
def _decode_text(raw_data):
    """Decode uploaded bytes; return ``(text, detected_encoding)``."""
    encoding = chardet.detect(raw_data)['encoding']
    codec = encoding or 'utf-8'
    # GB18030 is a superset of GBK and GB2312. Decoding with it avoids
    # replacement characters when GBK text is detected as plain GB2312.
    if codec.lower() in ('gb2312', 'gbk'):
        codec = 'gb18030'
    return raw_data.decode(codec, errors='replace'), encoding


def _load_stopwords(path):
    """Return the stop words stored one per line in *path*, or None if absent."""
    if not os.path.exists(path):
        return None
    # 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM, which
    # would otherwise stick to the first stop word.
    with open(path, 'r', encoding='utf-8-sig') as f:
        return {line.strip() for line in f if line.strip()}


def _count_words(text, stopwords):
    """Segment *text* with jieba and count the words worth keeping."""
    cleaned_text = re.sub(r'\s+', ' ', text)
    return Counter(
        word for word in jieba.lcut(cleaned_text)
        # Keep multi-character words that are neither stop words nor numbers.
        if len(word) > 1
        and word not in stopwords
        and not re.match(r'^\d+$', word)
    )


def main():
    """Render the Streamlit page: upload, chapter split, word cloud, network."""
    # Page style
    st.markdown(f"""
        <style>
            .reportview-container {{
                background: {BACKGROUND_COLOR};
                color: {TEXT_COLOR};
            }}
            .sidebar .sidebar-content {{
                background: {BACKGROUND_COLOR};
                border-right: 1px solid #2e2e2e;
            }}
            .st-bq {{
                color: {TEXT_COLOR} !important;
            }}
        </style>
    """, unsafe_allow_html=True)

    # Sidebar settings
    with st.sidebar:
        st.header("⚙️ 设置")
        uploaded_file = st.file_uploader(
            "上传小说文件",
            type=['txt'],
            help=f"最大文件尺寸:{MAX_FILE_SIZE}MB"
        )
        threshold = st.slider("关系阈值", 0.0, 1.0, 0.75, 0.05)
        num_keywords = st.slider("关键词数量", 10, 50, 10)

    # Main content area
    st.title("小说文本分析工具")

    if not uploaded_file:
        st.info("👈 请上传小说文件开始分析")
        return

    try:
        # File size validation
        if uploaded_file.size > MAX_FILE_SIZE * 1024 * 1024:
            st.error(f"文件大小超过{MAX_FILE_SIZE}MB限制")
            return

        # Encoding detection and decoding
        content, encoding = _decode_text(uploaded_file.getvalue())

        st.write(f"Detected encoding: {encoding}")
        st.write(f"File content preview: {content[:500]}...")

        # Chapter splitting
        chapters = split_chapters(content)
        if not chapters:
            st.error("未能识别到任何章节内容")
            return

        titles = [title for title, _ in chapters]
        st.write(f"Chapters detected: {titles}")

        # Stop words: fall back to an empty set when the file is missing
        stopwords = _load_stopwords(DEFAULT_STOPWORDS_PATH)
        if stopwords is None:
            st.warning(f"未找到停用词文件:{DEFAULT_STOPWORDS_PATH}")
            stopwords = set()

        # Keyword analysis
        keyword_data = []
        overall_counter = Counter()  # word counts across the whole text
        with st.spinner('分析中...'):
            for title, text in chapters:
                counter = _count_words(text, stopwords)
                overall_counter.update(counter)
                keyword_data.append((title, counter.most_common(num_keywords)))

            # Similarity matrix
            if len(chapters) > 1:
                similarity_matrix = calculate_jaccard_similarity(keyword_data, top_n=10)
            else:
                similarity_matrix = np.zeros((1, 1))

        # Layout
        col1, col2 = st.columns([1, 1])

        with col1:
            st.subheader("章节关键词")
            # Select by index so chapters with identical titles stay
            # individually selectable.
            idx = st.selectbox(
                "选择章节",
                options=list(range(len(chapters))),
                format_func=lambda i: titles[i],
                index=0
            )
            try:
                fig = generate_dark_wordcloud(dict(keyword_data[idx][1]))
                st.pyplot(fig)
                # The figure has been rendered; close it so figures do not
                # pile up across Streamlit reruns.
                plt.close(fig)
            except Exception as e:
                st.error(f"生成词云失败: {str(e)}")

        with col2:
            st.subheader("章节关系网络")
            if len(chapters) > 1:
                # Not named ``plt``: that would shadow the matplotlib module
                # for the whole function.
                network_plot = draw_network_graph(similarity_matrix, titles, threshold)
                st.pyplot(network_plot)
                # Close the current figure, i.e. the one just drawn.
                plt.close()
            else:
                st.info("需要至少两个章节生成关系网络")

        # Analysis report
        with st.expander("📊 分析详情"):
            total_chars = sum(len(text) for _, text in chapters)
            report_data = {
                "文件名": uploaded_file.name,
                "文件大小": f"{uploaded_file.size / 1024:.1f} KB",
                "识别章节数": len(chapters),
                "总字数": total_chars,
                "平均章节长度": f"{total_chars / len(chapters):.0f} 字",
                # Top keywords of the whole text, not of the first chapter only
                "高频关键词": "、".join(word for word, _ in overall_counter.most_common(5))
            }
            st.table(report_data)

    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the page.
        st.error(f"处理失败: {str(e)}")
        st.error("请检查文件格式是否符合要求(UTF-8/GBK编码的文本文件)")

# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()
相关推荐
MATLAB代码顾问6 分钟前
5大智能算法优化标准测试函数对比(Python实现)
开发语言·python
ting945200010 分钟前
Tornado 全栈技术深度指南:从原理到实战
人工智能·python·架构·tornado
果汁华33 分钟前
Browserbase Skills:让 Claude Agent 真正“看见“网页世界
人工智能·python
ZhengEnCi33 分钟前
04-缩放点积注意力代码实现 💻
人工智能·python
云水一下34 分钟前
从零开始!VMware安装Fedora Workstation 44桌面系统完整教程
前端
DeepReinforce1 小时前
三、AI量化投资:使用akshare获取A股主板20260430所有的涨停股票
python·量化·akshare·龙头战法
段一凡-华北理工大学2 小时前
【高炉炼铁领域炉温监测、预警、调控智能体设计与应用】~系列文章08:多模态数据融合:让数据更聪明
人工智能·python·高炉炼铁·ai赋能·工业智能体·高炉炉温
万粉变现经纪人2 小时前
如何解决 pip install llama-cpp-python 报错 未安装 CMake/Ninja 或 CPU 不支持 AVX 问题
开发语言·python·开源·aigc·pip·ai写作·llama
小码哥_常2 小时前
安卓黑科技:实现多平台商品详情页一键跳转APP
前端
killerbasd2 小时前
还是迷茫 5.3
前端·react.js·前端框架