lda模型:官方处理方式和自己处理数据对比

自己处理数据,然后分批训练,第一步先对比自己处理的方式和官方是否一致。

官方的代码

python 复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel

# 示例数据
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

# 预处理数据
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


# 训练 LDA 模型
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=2024)


# 打印每个主题的关键词
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_processed = [word for word in new_doc.lower().split()]
new_doc_bow = dictionary.doc2bow(new_doc_processed)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
Topic: 0
Words: 0.078*"graph" + 0.078*"trees" + 0.078*"the" + 0.078*"of" + 0.078*"in" + 0.078*"intersection" + 0.078*"paths" + 0.013*"minors" + 0.013*"interface" + 0.013*"survey"

Topic: 1
Words: 0.062*"of" + 0.034*"measurement" + 0.034*"relation" + 0.034*"to" + 0.034*"error" + 0.034*"perceived" + 0.034*"lab" + 0.034*"applications" + 0.034*"for" + 0.034*"machine"

Topic: 2
Words: 0.062*"minors" + 0.062*"trees" + 0.062*"the" + 0.062*"binary" + 0.062*"random" + 0.062*"generation" + 0.062*"unordered" + 0.062*"a" + 0.062*"survey" + 0.062*"graph"

Topic: 3
Words: 0.134*"system" + 0.073*"human" + 0.073*"eps" + 0.073*"and" + 0.073*"of" + 0.073*"engineering" + 0.073*"testing" + 0.012*"time" + 0.012*"user" + 0.012*"response"

Topic: 4
Words: 0.090*"of" + 0.090*"user" + 0.090*"system" + 0.049*"computer" + 0.049*"response" + 0.049*"time" + 0.049*"survey" + 0.049*"a" + 0.049*"interface" + 0.049*"management"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.066698), (1, 0.7288686), (2, 0.06669144), (3, 0.06943816), (4, 0.068303764)]
python 复制代码
print(dictionary.token2id)

'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
'''


print(corpus)

'''
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''

自己处理方式

python 复制代码
def get_dictionary(input_data):
    output_dict = {}
    count = 0
    
    for l in input_data:
        l_list = l.strip().lower().split(" ")
        sorted_l_list = sorted(l_list)
        for k in sorted_l_list:
            if k not in output_dict:
                output_dict[k] = count
                count += 1
                
    return output_dict

my_dict = get_dictionary(documents)
print(my_dict)



def get_corpus(input_dict, input_data):
    output_list = []
    for l in input_data:
        tmp_dict = {}
        l_list = l.strip().lower().split(" ")
        for k in l_list:
            if k not in tmp_dict:
                tmp_dict[k] = 0
            tmp_dict[k] += 1
            
        tmp_list = []
        for k, v in tmp_dict.items():
            if k in input_dict.keys():
                tmp_list.append((input_dict[k], v))
            else:
                continue
        output_list.append(sorted(tmp_list))
        
    return output_list


my_corpus = get_corpus(my_dict, documents)
print(my_corpus)


def get_predict_corpus(input_dict, input_data):

    tmp_dict = {}
    l_list = input_data.strip().lower().split(" ")
    for k in l_list:
        if k not in tmp_dict:
            tmp_dict[k] = 0
        tmp_dict[k] += 1

    tmp_list = []
    for k, v in tmp_dict.items():
        if k in input_dict.keys():
            tmp_list.append((input_dict[k], v))
        else:
            continue

    return sorted(tmp_list)


'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''
python 复制代码
my_dict == dictionary.token2id

'''
True
'''

my_corpus == corpus

'''
True
'''
python 复制代码
# 训练 LDA 模型
my_lda_model = LdaModel(my_corpus, num_topics=5, passes=15, random_state=2024)
print(my_lda_model)


# 打印每个主题的关键词
for idx, topic in my_lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_bow = get_predict_corpus(my_dict, new_doc)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
LdaModel<num_terms=42, num_topics=5, decay=0.5, chunksize=2000>
Topic: 0
Words: 0.078*"32" + 0.078*"30" + 0.078*"18" + 0.078*"9" + 0.078*"33" + 0.078*"34" + 0.078*"35" + 0.013*"37" + 0.013*"5" + 0.013*"12"

Topic: 1
Words: 0.062*"9" + 0.034*"23" + 0.034*"25" + 0.034*"26" + 0.034*"22" + 0.034*"24" + 0.034*"6" + 0.034*"1" + 0.034*"3" + 0.034*"7"

Topic: 2
Words: 0.062*"37" + 0.062*"30" + 0.062*"18" + 0.062*"27" + 0.062*"29" + 0.062*"28" + 0.062*"31" + 0.062*"8" + 0.062*"12" + 0.062*"32"

Topic: 3
Words: 0.134*"13" + 0.073*"4" + 0.073*"16" + 0.073*"19" + 0.073*"9" + 0.073*"20" + 0.073*"21" + 0.012*"14" + 0.012*"15" + 0.012*"11"

Topic: 4
Words: 0.090*"9" + 0.090*"15" + 0.090*"13" + 0.049*"2" + 0.049*"11" + 0.049*"14" + 0.049*"12" + 0.049*"8" + 0.049*"5" + 0.049*"17"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.06669798), (1, 0.72894156), (2, 0.06669143), (3, 0.06936743), (4, 0.06830162)]
相关推荐
StarPrayers.2 分钟前
旅行商问题(TSP)(2)(heuristics.py)(TSP 的两种贪心启发式算法实现)
前端·人工智能·python·算法·pycharm·启发式算法
木头左23 分钟前
波动率聚类现象对ETF网格密度配置的启示与应对策略
python
华仔AI智能体1 小时前
Qwen3(通义千问3)、OpenAI GPT-5、DeepSeek 3.2、豆包最新模型(Doubao 4.0)通用模型能力对比
人工智能·python·语言模型·agent·智能体
盼哥PyAI实验室1 小时前
踏上编程征程,与 Python 共舞
开发语言·python
西柚小萌新1 小时前
【深入浅出PyTorch】--6.2.PyTorch进阶训练技巧2
人工智能·pytorch·python
weixin_307779131 小时前
使用Python高效读取ZIP压缩文件中的UTF-8 JSON数据到Pandas和PySpark DataFrame
开发语言·python·算法·自动化·json
哎呀呦呵2 小时前
python内置模块-re模块介绍使用
java·python·mysql
paoqi 包奇2 小时前
pycharm中使用anaconda指定虚拟环境
ide·python·pycharm
fsnine2 小时前
图像视觉——颜色识别
图像处理·python·计算机视觉