lda模型:官方处理方式和自己处理数据对比

自己处理数据,然后分批训练,第一步先对比自己处理的方式和官方是否一致。

官方的代码

python 复制代码
import gensim
from gensim import corpora
from gensim.models import LdaModel

# 示例数据
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

# 预处理数据
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


# 训练 LDA 模型
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=2024)


# 打印每个主题的关键词
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_processed = [word for word in new_doc.lower().split()]
new_doc_bow = dictionary.doc2bow(new_doc_processed)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
Topic: 0
Words: 0.078*"graph" + 0.078*"trees" + 0.078*"the" + 0.078*"of" + 0.078*"in" + 0.078*"intersection" + 0.078*"paths" + 0.013*"minors" + 0.013*"interface" + 0.013*"survey"

Topic: 1
Words: 0.062*"of" + 0.034*"measurement" + 0.034*"relation" + 0.034*"to" + 0.034*"error" + 0.034*"perceived" + 0.034*"lab" + 0.034*"applications" + 0.034*"for" + 0.034*"machine"

Topic: 2
Words: 0.062*"minors" + 0.062*"trees" + 0.062*"the" + 0.062*"binary" + 0.062*"random" + 0.062*"generation" + 0.062*"unordered" + 0.062*"a" + 0.062*"survey" + 0.062*"graph"

Topic: 3
Words: 0.134*"system" + 0.073*"human" + 0.073*"eps" + 0.073*"and" + 0.073*"of" + 0.073*"engineering" + 0.073*"testing" + 0.012*"time" + 0.012*"user" + 0.012*"response"

Topic: 4
Words: 0.090*"of" + 0.090*"user" + 0.090*"system" + 0.049*"computer" + 0.049*"response" + 0.049*"time" + 0.049*"survey" + 0.049*"a" + 0.049*"interface" + 0.049*"management"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.066698), (1, 0.7288686), (2, 0.06669144), (3, 0.06943816), (4, 0.068303764)]
python 复制代码
print(dictionary.token2id)

'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
'''


print(corpus)

'''
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''

自己处理方式

python 复制代码
def get_dictionary(input_data):
    output_dict = {}
    count = 0
    
    for l in input_data:
        l_list = l.strip().lower().split(" ")
        sorted_l_list = sorted(l_list)
        for k in sorted_l_list:
            if k not in output_dict:
                output_dict[k] = count
                count += 1
                
    return output_dict

my_dict = get_dictionary(documents)
print(my_dict)



def get_corpus(input_dict, input_data):
    output_list = []
    for l in input_data:
        tmp_dict = {}
        l_list = l.strip().lower().split(" ")
        for k in l_list:
            if k not in tmp_dict:
                tmp_dict[k] = 0
            tmp_dict[k] += 1
            
        tmp_list = []
        for k, v in tmp_dict.items():
            if k in input_dict.keys():
                tmp_list.append((input_dict[k], v))
            else:
                continue
        output_list.append(sorted(tmp_list))
        
    return output_list


my_corpus = get_corpus(my_dict, documents)
print(my_corpus)


def get_predict_corpus(input_dict, input_data):

    tmp_dict = {}
    l_list = input_data.strip().lower().split(" ")
    for k in l_list:
        if k not in tmp_dict:
            tmp_dict[k] = 0
        tmp_dict[k] += 1

    tmp_list = []
    for k, v in tmp_dict.items():
        if k in input_dict.keys():
            tmp_list.append((input_dict[k], v))
        else:
            continue

    return sorted(tmp_list)


'''
{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'a': 8, 'of': 9, 'opinion': 10, 'response': 11, 'survey': 12, 'system': 13, 'time': 14, 'user': 15, 'eps': 16, 'management': 17, 'the': 18, 'and': 19, 'engineering': 20, 'testing': 21, 'error': 22, 'measurement': 23, 'perceived': 24, 'relation': 25, 'to': 26, 'binary': 27, 'generation': 28, 'random': 29, 'trees': 30, 'unordered': 31, 'graph': 32, 'in': 33, 'intersection': 34, 'paths': 35, 'iv': 36, 'minors': 37, 'ordering': 38, 'quasi': 39, 'well': 40, 'widths': 41}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(5, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(4, 1), (9, 1), (13, 2), (16, 1), (19, 1), (20, 1), (21, 1)], [(9, 1), (11, 1), (14, 1), (15, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (18, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(9, 1), (18, 1), (30, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(9, 1), (19, 1), (30, 1), (32, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(8, 1), (12, 1), (32, 1), (37, 1)]]
'''
python 复制代码
my_dict == dictionary.token2id

'''
True
'''

my_corpus == corpus

'''
True
'''
python 复制代码
# 训练 LDA 模型
my_lda_model = LdaModel(my_corpus, num_topics=5, passes=15, random_state=2024)
print(my_lda_model)


# 打印每个主题的关键词
for idx, topic in my_lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


# 推断新文档的主题分布
new_doc = "Human computer interaction"
new_doc_bow = get_predict_corpus(my_dict, new_doc)
print(new_doc_bow)
print("New document topic distribution:", lda_model.get_document_topics(new_doc_bow))

结果

python 复制代码
LdaModel<num_terms=42, num_topics=5, decay=0.5, chunksize=2000>
Topic: 0
Words: 0.078*"32" + 0.078*"30" + 0.078*"18" + 0.078*"9" + 0.078*"33" + 0.078*"34" + 0.078*"35" + 0.013*"37" + 0.013*"5" + 0.013*"12"

Topic: 1
Words: 0.062*"9" + 0.034*"23" + 0.034*"25" + 0.034*"26" + 0.034*"22" + 0.034*"24" + 0.034*"6" + 0.034*"1" + 0.034*"3" + 0.034*"7"

Topic: 2
Words: 0.062*"37" + 0.062*"30" + 0.062*"18" + 0.062*"27" + 0.062*"29" + 0.062*"28" + 0.062*"31" + 0.062*"8" + 0.062*"12" + 0.062*"32"

Topic: 3
Words: 0.134*"13" + 0.073*"4" + 0.073*"16" + 0.073*"19" + 0.073*"9" + 0.073*"20" + 0.073*"21" + 0.012*"14" + 0.012*"15" + 0.012*"11"

Topic: 4
Words: 0.090*"9" + 0.090*"15" + 0.090*"13" + 0.049*"2" + 0.049*"11" + 0.049*"14" + 0.049*"12" + 0.049*"8" + 0.049*"5" + 0.049*"17"

[(2, 1), (4, 1)]
New document topic distribution: [(0, 0.06669798), (1, 0.72894156), (2, 0.06669143), (3, 0.06936743), (4, 0.06830162)]
相关推荐
MemoriKu1 小时前
Flutter 本地 AI 相册工程收口:从屏幕常亮、标签体系到照片属性后台队列
大数据·人工智能·python·flutter·elasticsearch·搜索引擎·数据库架构
2401_885665191 小时前
基于OpenCV的模板匹配OCR实战:银行卡与身份证数字识别完整教程
人工智能·python·opencv·计算机视觉·ocr
装不满的克莱因瓶1 小时前
了解3D卷积原理——从空间感知到时空建模的深度学习核心算子
人工智能·pytorch·python·深度学习·机器学习·3d·ai
cup112 小时前
[开源] Memory Checker:极致轻量的 Windows 托盘内存监测工具,告别内存焦虑
python·内存·工具·任务管理器·托盘
码云骑士2 小时前
23-Django-ORM的N+1问题-select_related与prefetch_related详解
后端·python·django
Tbisnic2 小时前
AI大模型学习第十四天:Coze项目实战中的分治智慧
人工智能·python·学习·大模型·工作流·智能体·coze
master3362 小时前
python 安装pip
开发语言·python·pip
江畔柳前堤2 小时前
github实战指南03-Pull Request 全流程实战
开发语言·人工智能·python·深度学习·github·word
AC赳赳老秦2 小时前
OpenClaw + 飞书多维表格:自动同步数据、生成统计图表、触发自动化任务
java·大数据·python·缓存·自动化·deepseek·openclaw