使用LDA算法进行文本相似性分析

复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

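# If the NLTK 'punkt' tokenizer data and the stop-word list have not been downloaded yet, uncomment the lines below and run them once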
# nltk.download('punkt')
# nltk.download('stopwords')

# 数据预处理函数
def preprocess(text):
    """Tokenize *text* into lowercase, alphabetic, non-stop-word tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are dropped. Returns the surviving tokens as a list,
    in their original order.
    """
    english_stops = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in english_stops
    ]

# Example documents used to train the topic model
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity.",
]

# Preprocess every document into a list of tokens
texts = [preprocess(doc) for doc in documents]

# Build the token <-> id dictionary
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document to its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model. LDA training is randomized; fixing random_state keeps
# the printed similarities reproducible from one run to the next.
num_topics = 2
lda_model = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Extract the topic distribution of a new, unseen document.
# minimum_probability=0.0 keeps every topic in the returned vector; with the
# default threshold low-weight topics are dropped, so the cosine similarity
# below would be computed on truncated distributions.
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(
    new_doc_bow, minimum_probability=0.0
)

# Topic distributions of the original (training) documents
doc_topics = [
    lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for doc_bow in corpus
]

# Cosine similarity between the new document and each original document,
# stored as (document index, similarity) pairs
similarities = [
    (i, cossim(new_doc_topics, doc_topic))
    for i, doc_topic in enumerate(doc_topics)
]

# Print the similarity results
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

import gensim

from gensim import corpora

from gensim.models import LdaModel

from gensim.matutils import cossim

import nltk

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import string

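# If the NLTK 'punkt' tokenizer data and the 'stopwords' corpus are not installed yet, the two download calls below fetch them (only needed once)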

nltk.download('punkt')

nltk.download('stopwords')

# Data preprocessing function

def preprocess(text):
    """Tokenize *text* into lowercase, alphabetic, non-stop-word tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are dropped. Returns the remaining tokens as a list.
    """
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())

    tokens = [word for word in tokens if word.isalpha()]  # keep alphabetic tokens only

    tokens = [word for word in tokens if word not in stop_words]  # remove stop words

    return tokens

# Example documents used to train the topic model
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity.",
]

# Preprocess every document into a list of tokens
texts = [preprocess(doc) for doc in documents]

# Build the token <-> id dictionary
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document to its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model. LDA training is randomized; fixing random_state keeps
# the printed similarities reproducible from one run to the next.
num_topics = 2
lda_model = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Extract the topic distribution of a new, unseen document.
# minimum_probability=0.0 keeps every topic in the returned vector; with the
# default threshold low-weight topics are dropped, so the cosine similarity
# below would be computed on truncated distributions.
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(
    new_doc_bow, minimum_probability=0.0
)

# Topic distributions of the original (training) documents
doc_topics = [
    lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for doc_bow in corpus
]

# Cosine similarity between the new document and each original document,
# stored as (document index, similarity) pairs
similarities = [
    (i, cossim(new_doc_topics, doc_topic))
    for i, doc_topic in enumerate(doc_topics)
]

# Print the similarity results
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

相关推荐
铭毅天下2 分钟前
EasySearch Rules 规则语法速查手册
开发语言·前端·javascript·ecmascript
YMWM_17 分钟前
print(f“{s!r}“)解释
开发语言·r语言
愤豆20 分钟前
05-Java语言核心-语法特性--模块化系统详解
java·开发语言·python
bksczm21 分钟前
文件流(fstream)
java·开发语言
NGC_661122 分钟前
Java 线程池阻塞队列与拒绝策略
java·开发语言
小碗羊肉34 分钟前
【从零开始学Java | 第二十二篇】List集合
java·开发语言
m0_7167652336 分钟前
C++提高编程--STL常用容器(set/multiset、map/multimap容器)详解
java·开发语言·c++·经验分享·学习·青少年编程·visual studio
froginwe111 小时前
Bootstrap4 折叠组件使用指南
开发语言
sunwenjian8861 小时前
SpringBean的生命周期
java·开发语言
毕设源码-赖学姐1 小时前
【开题答辩全过程】以 基于Java的游泳馆会员管理系统的设计与实现为例,包含答辩的问题和答案
java·开发语言