LDA算法进行相似性分析

复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次
# nltk.download('punkt')
# nltk.download('stopwords')

# 数据预处理函数
def preprocess(text, language='english'):
    """Tokenize and clean a document for LDA.

    Lowercases *text*, tokenizes it with nltk, keeps purely alphabetic
    tokens, and drops stopwords for *language*.

    Parameters:
        text (str): raw document text.
        language (str): nltk stopword corpus language (default 'english',
            matching the original hard-coded behavior).

    Returns:
        list[str]: cleaned tokens.
    """
    stop_words = set(stopwords.words(language))
    tokens = word_tokenize(text.lower())
    # Keep alphabetic tokens only, then drop stopwords — one pass, same
    # result as filtering in two separate passes.
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# 示例文档
# Sample corpus for the demonstration.
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]

# Preprocess every document into a token list.
texts = [preprocess(doc) for doc in documents]

# Build the id<->token dictionary, then the bag-of-words corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit the LDA topic model.
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Infer the topic distribution of an unseen document.
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)

# Topic distributions of every training document.
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# Cosine similarity (in topic space) between the new document and each
# original one, paired with the original document's index.
similarities = [(i, cossim(new_doc_topics, topics)) for i, topics in enumerate(doc_topics)]

# Report the results.
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

import gensim

from gensim import corpora

from gensim.models import LdaModel

from gensim.matutils import cossim

import nltk

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import string

# If you have not yet downloaded nltk's tokenizer/stopword data, uncomment
# the two lines below and run once.
# nltk.download('punkt')
# nltk.download('stopwords')

# Preprocessing helper.
def preprocess(text):
    """Tokenize and clean a document for LDA.

    Lowercases *text*, tokenizes it with nltk, keeps purely alphabetic
    tokens, and removes English stopwords.

    Parameters:
        text (str): raw document text.

    Returns:
        list[str]: cleaned tokens.
    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # keep alphabetic tokens only
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return tokens

# Sample documents.
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]

# Preprocess the corpus.
texts = [preprocess(doc) for doc in documents]

# Build the dictionary.
dictionary = corpora.Dictionary(texts)

# Convert to the bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model.
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Extract the topic distribution of a new document.
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)

# Topic distributions of the original documents.
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# Compute the similarity between the new document and each original document.
similarities = []
for i, doc_topic in enumerate(doc_topics):
    similarity = cossim(new_doc_topics, doc_topic)
    similarities.append((i, similarity))

# Print the similarity results.
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

相关推荐
爱滑雪的码农18 小时前
Java基础十一 流(Stream)、文件(File)和IO
java·开发语言·python
叶小鸡18 小时前
Java 篇-项目实战-天机学堂(从0到1)-day11
java·开发语言
格林威18 小时前
线阵工业相机:线阵图像出现“波浪纹”,是机械振动还是编码器问题?
开发语言·人工智能·数码相机·计算机视觉·视觉检测·工业相机·线阵相机
liliangcsdn18 小时前
LLM如何辅助RAG从大量文档中筛选目标文档
开发语言·人工智能
无忧.芙桃18 小时前
C++IO库的超详细讲解
开发语言·c++
朗迹 - 张伟18 小时前
用AI开发QT——Qt与Trae开发环境搭建
开发语言·qt·策略模式
雨辰AI18 小时前
从 MySQL 迁移至人大金仓 V9 完整改造指南|分页 / 函数 / 语法兼容全部解决
java·开发语言·数据库·后端·mysql·政务
MATLAB代码顾问18 小时前
改进鲸鱼优化算法(IWOA)求解柔性作业车间调度问题(FJSP)——附MATLAB代码
开发语言·算法·matlab
阿里嘎多学长19 小时前
2026-05-03 GitHub 热点项目精选
开发语言·程序员·github·代码托管