LDA算法进行相似性分析

复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次
# nltk.download('punkt')
# nltk.download('stopwords')

# 数据预处理函数
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # 仅保留字母
    tokens = [word for word in tokens if word not in stop_words]  # 去除停用词
    return tokens

# 示例文档
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]

# 数据预处理
texts = [preprocess(doc) for doc in documents]

# 创建词典
dictionary = corpora.Dictionary(texts)

# 转换为词袋模型
corpus = [dictionary.doc2bow(text) for text in texts]

# 训练LDA模型
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# 对新文档进行主题分布提取
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)

# 获取原始文档的主题分布
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# 计算新文档与每个原始文档的相似性
similarities = []
for i, doc_topic in enumerate(doc_topics):
    similarity = cossim(new_doc_topics, doc_topic)
    similarities.append((i, similarity))

# 输出相似性结果
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

import gensim

from gensim import corpora

from gensim.models import LdaModel

from gensim.matutils import cossim

import nltk

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import string

如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次

nltk.download('punkt')

nltk.download('stopwords')

数据预处理函数

def preprocess(text):

stop_words = set(stopwords.words('english'))

tokens = word_tokenize(text.lower())

tokens = [word for word in tokens if word.isalpha()] # 仅保留字母

tokens = [word for word in tokens if word not in stop_words] # 去除停用词

return tokens

示例文档

documents = [

"Text processing using LDA is interesting.",

"Another document example for LDA.",

"Text mining and natural language processing.",

"LDA helps in topic modeling and finding patterns.",

"This document is for testing LDA similarity."

]

数据预处理

texts = [preprocess(doc) for doc in documents]

创建词典

dictionary = corpora.Dictionary(texts)

转换为词袋模型

corpus = [dictionary.doc2bow(text) for text in texts]

训练LDA模型

num_topics = 2

lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

对新文档进行主题分布提取

new_doc = "New text for testing similarity with LDA."

new_doc_preprocessed = preprocess(new_doc)

new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)

new_doc_topics = lda_model.get_document_topics(new_doc_bow)

获取原始文档的主题分布

doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

计算新文档与每个原始文档的相似性

similarities = []

for i, doc_topic in enumerate(doc_topics):

similarity = cossim(new_doc_topics, doc_topic)

similarities.append((i, similarity))

输出相似性结果

print("Similarity between new document and each original document:")

for i, similarity in similarities:

print(f"Document {i}: Similarity = {similarity}")

相关推荐
香蕉可乐荷包蛋3 分钟前
Python学习之路(十三)-常用函数的使用,及优化
开发语言·python·学习
惜.己11 分钟前
使用python的读取xml文件,简单的处理成元组数组
xml·开发语言·python·测试工具
apihz34 分钟前
域名WHOIS信息查询免费API使用指南
android·开发语言·数据库·网络协议·tcp/ip
coding随想1 小时前
掌控网页的魔法之书:JavaScript DOM的奇幻之旅
开发语言·javascript·ecmascript
爱吃烤鸡翅的酸菜鱼1 小时前
IDEA高效开发:Database Navigator插件安装与核心使用指南
java·开发语言·数据库·编辑器·intellij-idea·database
神仙别闹1 小时前
基于C#+SQL Server实现(Web)学生选课管理系统
前端·数据库·c#
心情好的小球藻2 小时前
Python应用进阶DAY9--类型注解Type Hinting
开发语言·python
惜.己2 小时前
使用python读取json数据,简单的处理成元组数组
开发语言·python·测试工具·json
Y4090012 小时前
C语言转Java语言,相同与相异之处
java·c语言·开发语言·笔记
向宇it2 小时前
【unity组件介绍】URP Decal Projector贴花投影器,将特定材质(贴花)投影到场景中的其他对象上。
游戏·3d·unity·c#·游戏引擎·材质