LDA算法进行相似性分析

复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次
# nltk.download('punkt')
# nltk.download('stopwords')

# 数据预处理函数
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # 仅保留字母
    tokens = [word for word in tokens if word not in stop_words]  # 去除停用词
    return tokens

# 示例文档
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]

# 数据预处理
texts = [preprocess(doc) for doc in documents]

# 创建词典
dictionary = corpora.Dictionary(texts)

# 转换为词袋模型
corpus = [dictionary.doc2bow(text) for text in texts]

# 训练LDA模型
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# 对新文档进行主题分布提取
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)

# 获取原始文档的主题分布
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# 计算新文档与每个原始文档的相似性
similarities = []
for i, doc_topic in enumerate(doc_topics):
    similarity = cossim(new_doc_topics, doc_topic)
    similarities.append((i, similarity))

# 输出相似性结果
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

import gensim

from gensim import corpora

from gensim.models import LdaModel

from gensim.matutils import cossim

import nltk

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import string

如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次

nltk.download('punkt')

nltk.download('stopwords')

数据预处理函数

def preprocess(text):

stop_words = set(stopwords.words('english'))

tokens = word_tokenize(text.lower())

tokens = [word for word in tokens if word.isalpha()] # 仅保留字母

tokens = [word for word in tokens if word not in stop_words] # 去除停用词

return tokens

示例文档

documents = [

"Text processing using LDA is interesting.",

"Another document example for LDA.",

"Text mining and natural language processing.",

"LDA helps in topic modeling and finding patterns.",

"This document is for testing LDA similarity."

]

数据预处理

texts = [preprocess(doc) for doc in documents]

创建词典

dictionary = corpora.Dictionary(texts)

转换为词袋模型

corpus = [dictionary.doc2bow(text) for text in texts]

训练LDA模型

num_topics = 2

lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

对新文档进行主题分布提取

new_doc = "New text for testing similarity with LDA."

new_doc_preprocessed = preprocess(new_doc)

new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)

new_doc_topics = lda_model.get_document_topics(new_doc_bow)

获取原始文档的主题分布

doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

计算新文档与每个原始文档的相似性

similarities = []

for i, doc_topic in enumerate(doc_topics):

similarity = cossim(new_doc_topics, doc_topic)

similarities.append((i, similarity))

输出相似性结果

print("Similarity between new document and each original document:")

for i, similarity in similarities:

print(f"Document {i}: Similarity = {similarity}")

相关推荐
晨星shine1 天前
GC、Dispose、Unmanaged Resource 和 Managed Resource
后端·c#
用户298698530142 天前
.NET 文档自动化:Spire.Doc 设置奇偶页页眉/页脚的最佳实践
后端·c#·.net
用户3667462526742 天前
接口文档汇总 - 2.设备状态管理
c#
用户3667462526742 天前
接口文档汇总 - 3.PLC通信管理
c#
Ray Liang3 天前
用六边形架构与整洁架构对比是伪命题?
java·python·c#·架构设计
Scout-leaf6 天前
WPF新手村教程(三)—— 路由事件
c#·wpf
用户298698530146 天前
程序员效率工具:Spire.Doc如何助你一键搞定Word表格排版
后端·c#·.net
mudtools7 天前
搭建一套.net下能落地的飞书考勤系统
后端·c#·.net
玩泥巴的7 天前
搭建一套.net下能落地的飞书考勤系统
c#·.net·二次开发·飞书
唐宋元明清21887 天前
.NET 本地Db数据库-技术方案选型
windows·c#