import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# 如果您尚未下载nltk的停用词列表,请取消下面的注释并运行一次
# nltk.download('punkt')
# nltk.download('stopwords')
# 数据预处理函数
def preprocess(text):
    """Tokenize *text* into lowercase alphabetic words without English stop words.

    Relies on NLTK's tokenizer and stop-word data being installed
    (``nltk.download('punkt')`` / ``nltk.download('stopwords')``).

    Args:
        text: Raw document string.

    Returns:
        A list of tokens in their original order. Tokens for which
        ``str.isalpha`` is false (punctuation, numbers, mixed tokens) and
        tokens present in NLTK's English stop-word list are dropped.
    """
    stop_words = set(stopwords.words('english'))
    # Tokenize the lower-cased text, then keep only alphabetic tokens that
    # are not stop words (single pass instead of two intermediate lists).
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]
# Example documents
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity.",
]
# Preprocess every document into a list of tokens
texts = [preprocess(doc) for doc in documents]
# Build the dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)
# Convert each document to its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]
# Train the LDA model
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
# Extract the topic distribution of a new document
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
# minimum_probability=0.0 asks for every topic instead of only those above
# the default threshold, so the vectors compared below are not truncated.
new_doc_topics = lda_model.get_document_topics(new_doc_bow, minimum_probability=0.0)
# Topic distributions of the original documents
doc_topics = [
    lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for doc_bow in corpus
]
# Cosine similarity between the new document and each original document,
# stored as (document index, similarity) pairs
similarities = [
    (i, cossim(new_doc_topics, doc_topic))
    for i, doc_topic in enumerate(doc_topics)
]
# Print the similarity results
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# If the NLTK tokenizer models and stop-word list have not been downloaded yet,
# uncomment the two lines below and run this file once.
# nltk.download('punkt')
# nltk.download('stopwords')
# Text preprocessing function
def preprocess(text):
    """Tokenize *text* into lowercase alphabetic words without English stop words.

    Relies on NLTK's tokenizer and stop-word data being installed
    (``nltk.download('punkt')`` / ``nltk.download('stopwords')``).

    Args:
        text: Raw document string.

    Returns:
        A list of tokens in their original order. Tokens for which
        ``str.isalpha`` is false (punctuation, numbers, mixed tokens) and
        tokens present in NLTK's English stop-word list are dropped.
    """
    stop_words = set(stopwords.words('english'))
    # Tokenize the lower-cased text, then keep only alphabetic tokens that
    # are not stop words (single pass instead of two intermediate lists).
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]
# Example documents
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity.",
]
# Preprocess every document into a list of tokens
texts = [preprocess(doc) for doc in documents]
# Build the dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)
# Convert each document to its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]
# Train the LDA model
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
# Extract the topic distribution of a new document
new_doc = "New text for testing similarity with LDA."
new_doc_preprocessed = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_preprocessed)
# minimum_probability=0.0 asks for every topic instead of only those above
# the default threshold, so the vectors compared below are not truncated.
new_doc_topics = lda_model.get_document_topics(new_doc_bow, minimum_probability=0.0)
# Topic distributions of the original documents
doc_topics = [
    lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for doc_bow in corpus
]
# Cosine similarity between the new document and each original document,
# stored as (document index, similarity) pairs
similarities = [
    (i, cossim(new_doc_topics, doc_topic))
    for i, doc_topic in enumerate(doc_topics)
]
# Print the similarity results
print("Similarity between new document and each original document:")
for i, similarity in similarities:
    print(f"Document {i}: Similarity = {similarity}")