lda模型:官方处理方式和自己处理数据对比

自己处理数据,然后分批训练,第一步先对比自己处理的方式和官方是否一致。

官方的代码

python 复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel

# 示例数据
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

# 预处理数据
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


# 训练 LDA 模型
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=2024)


# 打印每个主题的关键词
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_processed = [word for word in new_doc.lower().split()]
new_doc_bow = dictionary.doc2bow(new_doc_processed)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
Topic: 0
Words: 0.078*"graph" + 0.078*"trees" + 0.078*"the" + 0.078*"of" + 0.078*"in" + 0.078*"intersection" + 0.078*"paths" + 0.013*"minors" + 0.013*"interface" + 0.013*"survey"

Topic: 1
Words: 0.062*"of" + 0.034*"measurement" + 0.034*"relation" + 0.034*"to" + 0.034*"error" + 0.034*"perceived" + 0.034*"lab" + 0.034*"applications" + 0.034*"for" + 0.034*"machine"

Topic: 2
Words: 0.062*"minors" + 0.062*"trees" + 0.062*"the" + 0.062*"binary" + 0.062*"random" + 0.062*"generation" + 0.062*"unordered" + 0.062*"a" + 0.062*"survey" + 0.062*"graph"

Topic: 3
Words: 0.134*"system" + 0.073*"human" + 0.073*"eps" + 0.073*"and" + 0.073*"of" + 0.073*"engineering" + 0.073*"testing" + 0.012*"time" + 0.012*"user" + 0.012*"response"

Topic: 4
Words: 0.090*"of" + 0.090*"user" + 0.090*"system" + 0.049*"computer" + 0.049*"response" + 0.049*"time" + 0.049*"survey" + 0.049*"a" + 0.049*"interface" + 0.049*"management"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.066698), (1, 0.7288686), (2, 0.06669144), (3, 0.06943816), (4, 0.068303764)]
python 复制代码
print(dictionary.token2id)

'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
'''


print(corpus)

'''
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''

自己处理方式

python 复制代码
def get_dictionary(input_data):
    output_dict = {}
    count = 0
    
    for l in input_data:
        l_list = l.strip().lower().split(" ")
        sorted_l_list = sorted(l_list)
        for k in sorted_l_list:
            if k not in output_dict:
                output_dict[k] = count
                count += 1
                
    return output_dict

my_dict = get_dictionary(documents)
print(my_dict)



def get_corpus(input_dict, input_data):
    output_list = []
    for l in input_data:
        tmp_dict = {}
        l_list = l.strip().lower().split(" ")
        for k in l_list:
            if k not in tmp_dict:
                tmp_dict[k] = 0
            tmp_dict[k] += 1
            
        tmp_list = []
        for k, v in tmp_dict.items():
            if k in input_dict.keys():
                tmp_list.append((input_dict[k], v))
            else:
                continue
        output_list.append(sorted(tmp_list))
        
    return output_list


my_corpus = get_corpus(my_dict, documents)
print(my_corpus)


def get_predict_corpus(input_dict, input_data):

    tmp_dict = {}
    l_list = input_data.strip().lower().split(" ")
    for k in l_list:
        if k not in tmp_dict:
            tmp_dict[k] = 0
        tmp_dict[k] += 1

    tmp_list = []
    for k, v in tmp_dict.items():
        if k in input_dict.keys():
            tmp_list.append((input_dict[k], v))
        else:
            continue

    return sorted(tmp_list)


'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''
python 复制代码
my_dict == dictionary.token2id

'''
True
'''

my_corpus == corpus

'''
True
'''
python 复制代码
# 训练 LDA 模型
my_lda_model = LdaModel(my_corpus, num_topics=5, passes=15, random_state=2024)
print(my_lda_model)


# 打印每个主题的关键词
for idx, topic in my_lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_bow = get_predict_corpus(my_dict, new_doc)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
LdaModel<num_terms=42, num_topics=5, decay=0.5, chunksize=2000>
Topic: 0
Words: 0.078*"32" + 0.078*"30" + 0.078*"18" + 0.078*"9" + 0.078*"33" + 0.078*"34" + 0.078*"35" + 0.013*"37" + 0.013*"5" + 0.013*"12"

Topic: 1
Words: 0.062*"9" + 0.034*"23" + 0.034*"25" + 0.034*"26" + 0.034*"22" + 0.034*"24" + 0.034*"6" + 0.034*"1" + 0.034*"3" + 0.034*"7"

Topic: 2
Words: 0.062*"37" + 0.062*"30" + 0.062*"18" + 0.062*"27" + 0.062*"29" + 0.062*"28" + 0.062*"31" + 0.062*"8" + 0.062*"12" + 0.062*"32"

Topic: 3
Words: 0.134*"13" + 0.073*"4" + 0.073*"16" + 0.073*"19" + 0.073*"9" + 0.073*"20" + 0.073*"21" + 0.012*"14" + 0.012*"15" + 0.012*"11"

Topic: 4
Words: 0.090*"9" + 0.090*"15" + 0.090*"13" + 0.049*"2" + 0.049*"11" + 0.049*"14" + 0.049*"12" + 0.049*"8" + 0.049*"5" + 0.049*"17"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.06669798), (1, 0.72894156), (2, 0.06669143), (3, 0.06936743), (4, 0.06830162)]
相关推荐
熊猫在哪8 分钟前
野火鲁班猫(arrch64架构debian)从零实现用MobileFaceNet算法进行实时人脸识别(一)conda环境搭建
linux·人工智能·python·嵌入式硬件·神经网络·机器学习·边缘计算
满怀101513 分钟前
【Python中的Socket套接字详解】网络通信的核心基石
开发语言·网络·python·网络编程·socket
pk_xz12345630 分钟前
Python程序实现了一个完整的车辆信息查询系统
开发语言·python·opencv
Stuomasi_xiaoxin40 分钟前
FFmpeg 超级详细安装与配置教程(Windows 系统)
python·深度学习·ai·ffmpeg
z人间防沉迷k1 小时前
高效查询:位图、B+树
开发语言·数据结构·笔记·python·算法
.小墨迹2 小时前
Python学习——执行python时,键盘按下ctrl+c,退出程序
linux·开发语言·python·学习·自动驾驶
蓝婷儿2 小时前
6个月Python学习计划 Day 1
开发语言·python·学习
chicpopoo2 小时前
Python打卡DAY33
开发语言·python
Bugabooo2 小时前
python 打卡DAY27
开发语言·python
YUNYINGXIA3 小时前
Python实现Web请求与响应
开发语言·前端·python