LDA算法进行相似性分析

复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次
# nltk.download('punkt')
# nltk.download('stopwords')

# 数据预处理函数
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # 仅保留字母
    tokens = [word for word in tokens if word not in stop_words]  # 去除停用词
    return tokens

# 示例文档
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]

# 数据预处理
texts = [preprocess(doc) for doc in documents]

# 创建词典
dictionary = corpora.Dictionary(texts)

# 转换为词袋模型
corpus = [dictionary.doc2bow(text) for text in texts]

# 训练LDA模型
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# 对新文档进行主题分布提取
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
new_doc_topics = lda_model.get_document_topics(new_doc_bow)

# 获取原始文档的主题分布
doc_topics = [lda_model.get_document_topics(doc_bow) for doc_bow in corpus]

# 计算新文档与每个原始文档的相似性
similarities = []
for i, doc_topic in enumerate(doc_topics):
    similarity = cossim(new_doc_topics, doc_topic)
    similarities.append((i, similarity))

# 输出相似性结果
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")

import gensim

from gensim import corpora

from gensim.models import LdaModel

from gensim.matutils import cossim

import nltk

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

import string

如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次

nltk.download('punkt')

nltk.download('stopwords')

数据预处理函数

def preprocess(text):

stop_words = set(stopwords.words('english'))

tokens = word_tokenize(text.lower())

tokens = word for word in tokens if word.isalpha() # 仅保留字母

tokens = word for word in tokens if word not in stop_words # 去除停用词

return tokens

示例文档

documents = [

"Text processing using LDA is interesting.",

"Another document example for LDA.",

"Text mining and natural language processing.",

"LDA helps in topic modeling and finding patterns.",

"This document is for testing LDA similarity."

]

数据预处理

texts = preprocess(doc) for doc in documents

创建词典

dictionary = corpora.Dictionary(texts)

转换为词袋模型

corpus = dictionary.doc2bow(text) for text in texts

训练LDA模型

num_topics = 2

lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

对新文档进行主题分布提取

new_doc = "New text for testing similarity with LDA."

new_doc_preprocessed = preprocess(new_doc)

new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)

new_doc_topics = lda_model.get_document_topics(new_doc_bow)

获取原始文档的主题分布

doc_topics = lda_model.get_document_topics(doc_bow) for doc_bow in corpus

计算新文档与每个原始文档的相似性

similarities = \[\]

for i, doc_topic in enumerate(doc_topics):

similarity = cossim(new_doc_topics, doc_topic)

similarities.append((i, similarity))

输出相似性结果

print("Similarity between new document and each original document:")

for i, similarity in similarities:

print(f"Document {i}: Similarity = {similarity}")

相关推荐
huangdong_11 分钟前
1688商品图片采集技术解析:登录态处理与SKU图自动分类
开发语言
chase_my_dream23 分钟前
C++ + SLAM 高频面试问题整理
开发语言·c++·面试
Cloud_Shy6181 小时前
解读《Effective Python 3rd Edition》:从练气到老魔(第五章 Item 30 - 32)
开发语言·人工智能·笔记·python·学习方法
天佑木枫1 小时前
15天Python入门系列 · 序
开发语言·python
宋拾壹2 小时前
同时添加多个类目
android·开发语言·javascript
凡人叶枫3 小时前
Effective C++ 条款04:确定对象被使用前已先被初始化
java·linux·开发语言·c++·嵌入式开发
小小龙学IT3 小时前
Go 语言后端开发:从并发模型到生产落地的工程实践
开发语言·后端·golang
ytttr8733 小时前
Qt 数字键盘实现
开发语言·qt
wearegogog1233 小时前
C# .NET 文件比较工具 WinForms
开发语言·c#·.net
再写一行代码就下班3 小时前
Cursor配置Java环境、创建Spring Boot项目的步骤
java·开发语言·spring boot