LDA 模型:官方数据处理方式与自己处理数据方式的对比

目标是自己处理数据,然后分批训练模型。第一步,先验证自己处理数据得到的结果与官方(gensim)的处理结果是否一致。

官方的代码

python 复制代码
from collections import Counter

import gensim
from gensim import corpora
from gensim.models import LdaModel

# Sample data
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Preprocess: lowercase and whitespace-tokenize each document, then build the
# gensim dictionary and convert every token list to a bag-of-words vector
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]


# Train the LDA model
lda_model = LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=2024,
)


# Print the keywords of every topic
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# Infer the topic distribution of a new document
new_doc = "Human computer interaction"
new_doc_processed = new_doc.lower().split()
new_doc_bow = dictionary.doc2bow(new_doc_processed)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
Topic: 0
Words: 0.078*"graph" + 0.078*"trees" + 0.078*"the" + 0.078*"of" + 0.078*"in" + 0.078*"intersection" + 0.078*"paths" + 0.013*"minors" + 0.013*"interface" + 0.013*"survey"

Topic: 1
Words: 0.062*"of" + 0.034*"measurement" + 0.034*"relation" + 0.034*"to" + 0.034*"error" + 0.034*"perceived" + 0.034*"lab" + 0.034*"applications" + 0.034*"for" + 0.034*"machine"

Topic: 2
Words: 0.062*"minors" + 0.062*"trees" + 0.062*"the" + 0.062*"binary" + 0.062*"random" + 0.062*"generation" + 0.062*"unordered" + 0.062*"a" + 0.062*"survey" + 0.062*"graph"

Topic: 3
Words: 0.134*"system" + 0.073*"human" + 0.073*"eps" + 0.073*"and" + 0.073*"of" + 0.073*"engineering" + 0.073*"testing" + 0.012*"time" + 0.012*"user" + 0.012*"response"

Topic: 4
Words: 0.090*"of" + 0.090*"user" + 0.090*"system" + 0.049*"computer" + 0.049*"response" + 0.049*"time" + 0.049*"survey" + 0.049*"a" + 0.049*"interface" + 0.049*"management"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.066698), (1, 0.7288686), (2, 0.06669144), (3, 0.06943816), (4, 0.068303764)]
python 复制代码
# Show the token -> id mapping held by the gensim dictionary
print(dictionary.token2id)

# Output of the print call above, pasted as a bare string literal:
'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
'''


# Show the bag-of-words corpus: one list of (token_id, count) pairs per document
print(corpus)

# Output of the print call above, pasted as a bare string literal:
'''
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''

自己处理方式

python 复制代码
def get_dictionary(input_data):
    """Build a token -> integer id mapping from raw documents.

    Each document is lowercased and split on runs of whitespace, the same
    tokenization the gensim-based reference code in this file applies
    (``document.lower().split()``). Documents are processed in order; within
    one document, unseen tokens receive consecutive ids in alphabetical order.

    Args:
        input_data: Iterable of document strings.

    Returns:
        A dict mapping each distinct token to its id, starting at 0.
    """
    output_dict = {}

    for document in input_data:
        # split() without arguments collapses repeated spaces/tabs/newlines and
        # yields no empty tokens, so '' never ends up in the vocabulary.
        for token in sorted(set(document.lower().split())):
            # len(output_dict) is the next free id; an existing token keeps
            # the id it was given first.
            output_dict.setdefault(token, len(output_dict))

    return output_dict

# Build the token -> id mapping from the sample documents and display it
my_dict = get_dictionary(documents)
print(my_dict)



def get_corpus(input_dict, input_data):
    """Convert raw documents into sorted bag-of-words lists.

    Each document is lowercased and split on runs of whitespace, the same
    tokenization the gensim-based reference code in this file applies
    (``document.lower().split()``). Tokens missing from ``input_dict`` are
    ignored.

    Args:
        input_dict: Mapping of token -> integer id.
        input_data: Iterable of document strings.

    Returns:
        A list with one entry per document; each entry is a list of
        ``(token_id, count)`` tuples in ascending order.
    """
    output_list = []
    for document in input_data:
        # split() without arguments handles repeated spaces, tabs and newlines
        # and never produces empty tokens.
        token_counts = Counter(document.lower().split())
        bow = sorted(
            (input_dict[token], freq)
            for token, freq in token_counts.items()
            if token in input_dict
        )
        output_list.append(bow)

    return output_list


# Convert the sample documents to bag-of-words lists using my_dict and display them
my_corpus = get_corpus(my_dict, documents)
print(my_corpus)


def get_predict_corpus(input_dict, input_data):
    """Convert a single raw document into a sorted bag-of-words list.

    The document is lowercased and split on runs of whitespace, the same
    tokenization the gensim-based reference code in this file applies
    (``new_doc.lower().split()``). Tokens missing from ``input_dict`` are
    ignored.

    Args:
        input_dict: Mapping of token -> integer id.
        input_data: One document string.

    Returns:
        A list of ``(token_id, count)`` tuples in ascending order; empty when
        no token of the document is in ``input_dict``.
    """
    # split() without arguments handles repeated spaces, tabs and newlines
    # and never produces empty tokens.
    token_counts = Counter(input_data.lower().split())
    return sorted(
        (input_dict[token], freq)
        for token, freq in token_counts.items()
        if token in input_dict
    )


# Output of print(my_dict) and print(my_corpus), pasted as a bare string literal:
'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''
python 复制代码
# Check that the hand-built dictionary equals gensim's token -> id mapping
my_dict == dictionary.token2id

'''
True
'''

# Check that the hand-built corpus equals the corpus produced via doc2bow
my_corpus == corpus

'''
True
'''
python 复制代码
# Train an LDA model on the hand-built corpus.
# No id2word mapping is passed here, so the topics below are printed with
# token ids instead of words.
my_lda_model = LdaModel(my_corpus, num_topics=5, passes=15, random_state=2024)
print(my_lda_model)


# Print the keywords of every topic
for idx, topic in my_lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# Infer the topic distribution of a new document
new_doc = "Human computer interaction"
new_doc_bow = get_predict_corpus(my_dict, new_doc)
print(new_doc_bow)
# Query my_lda_model (trained on my_corpus), not the reference lda_model;
# otherwise this step would not exercise the hand-built pipeline's model.
print("New document topic distribution:", my_lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
LdaModel<num_terms=42, num_topics=5, decay=0.5, chunksize=2000>
Topic: 0
Words: 0.078*"32" + 0.078*"30" + 0.078*"18" + 0.078*"9" + 0.078*"33" + 0.078*"34" + 0.078*"35" + 0.013*"37" + 0.013*"5" + 0.013*"12"

Topic: 1
Words: 0.062*"9" + 0.034*"23" + 0.034*"25" + 0.034*"26" + 0.034*"22" + 0.034*"24" + 0.034*"6" + 0.034*"1" + 0.034*"3" + 0.034*"7"

Topic: 2
Words: 0.062*"37" + 0.062*"30" + 0.062*"18" + 0.062*"27" + 0.062*"29" + 0.062*"28" + 0.062*"31" + 0.062*"8" + 0.062*"12" + 0.062*"32"

Topic: 3
Words: 0.134*"13" + 0.073*"4" + 0.073*"16" + 0.073*"19" + 0.073*"9" + 0.073*"20" + 0.073*"21" + 0.012*"14" + 0.012*"15" + 0.012*"11"

Topic: 4
Words: 0.090*"9" + 0.090*"15" + 0.090*"13" + 0.049*"2" + 0.049*"11" + 0.049*"14" + 0.049*"12" + 0.049*"8" + 0.049*"5" + 0.049*"17"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.06669798), (1, 0.72894156), (2, 0.06669143), (3, 0.06936743), (4, 0.06830162)]
相关推荐
User_芊芊君子5 分钟前
【JavaSE】复习总结
java·开发语言·python
计算机毕业设计木哥7 分钟前
计算机毕业设计 基于Python+Django的医疗数据分析系统
开发语言·hadoop·后端·python·spark·django·课程设计
Python×CATIA工业智造12 分钟前
Python索引-值对迭代完全指南:从基础到高性能系统设计
python·pycharm
Luchang-Li1 小时前
sglang pytorch NCCL hang分析
pytorch·python·nccl
一个天蝎座 白勺 程序猿6 小时前
Python爬虫(47)Python异步爬虫与K8S弹性伸缩:构建百万级并发数据采集引擎
爬虫·python·kubernetes
XiaoMu_0017 小时前
基于Django+Vue3+YOLO的智能气象检测系统
python·yolo·django
honder试试8 小时前
焊接自动化测试平台图像处理分析-模型训练推理
开发语言·python
心本无晴.8 小时前
Python进程,线程
python·进程
java1234_小锋11 小时前
Scikit-learn Python机器学习 - 特征降维 压缩数据 - 特征提取 - 主成分分析 (PCA)
python·机器学习·scikit-learn
java1234_小锋11 小时前
Scikit-learn Python机器学习 - 特征降维 压缩数据 - 特征提取 - 线性判别分析 (LDA)
python·机器学习·scikit-learn