
  • 模块一:训练LDA模型
  • 模块二:困惑度计算
  • 模块三:得到一段文本的主题
  • 全部代码及案例(可直接运行)


pip install gensim


import gensim  # pip install gensim
from gensim import corpora


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Train an LDA topic model on tokenized documents.

    :param all_contents: iterable of documents, each a list of token strings
    :param dictionary: object whose ``doc2bow`` turns a token list into a
        bag-of-words; it is also handed to the model as ``id2word``
    :param num_topic: number of topics to fit
    :return: the trained ``LdaModel``
    """
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    # Core call: fit the model on the bag-of-words corpus.
    return gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)


if __name__ == '__main__':
    # One document per line, tokens separated by whitespace.
    # The context manager closes the file handle once the lines are read.
    with open('data.txt') as data_file:
        data = [content.split() for content in data_file]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        lda_model.save('lda_' + str(num_topic) + '.model')  # persist the LDA model
    except Exception as e:
        # Best-effort demo script: report the failure instead of a traceback.
        print(e)


in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in



import math
import gensim


def perplexity(ldamodel: gensim.models.LdaModel, data, dictionary: gensim.corpora.Dictionary):
    """Compute the perplexity of an LDA model over tokenized documents.

    For every word in a document, p(w) = sum_z p(z | doc) * p(w | z).
    The result is exp(-(sum over all tokens of log p(w)) / total token count).

    :param ldamodel: the trained LDA model
    :param data: documents to evaluate, each a list of token strings
    :param dictionary: the Dictionary obtained with corpora.Dictionary(my_data)
        from the data ``my_data`` that the gensim model was trained on
    :return: the perplexity
    :raises ZeroDivisionError: if the bag-of-words corpus built from ``data``
        contains no tokens at all
    """
    size_dictionary = len(dictionary.keys())
    testset = [dictionary.doc2bow(doc) for doc in data]
    num_topics = ldamodel.num_topics

    # One {word: probability} mapping per topic, covering the whole vocabulary.
    topic_word_list = [
        dict(ldamodel.show_topic(topic_id, size_dictionary))
        for topic_id in range(num_topics)
    ]

    # One {topic_id: probability} mapping per document. Keyed by topic id
    # rather than list position because gensim leaves out topics whose
    # probability falls below its internal floor, so position != topic id.
    doc_topics_list = [
        dict(ldamodel.get_document_topics(doc, minimum_probability=0))
        for doc in testset
    ]

    log_prob_sum = 0.0
    testset_word_num = 0
    for doc, doc_topics in zip(testset, doc_topics_list):
        for word_id, num in doc:
            word = dictionary[word_id]
            # p(w) = sum_z p(z) * p(w | z)
            prob_word = 0.0
            for topic_id, topic_word in enumerate(topic_word_list):
                prob_word += doc_topics.get(topic_id, 0.0) * topic_word[word]
            # Weight by the occurrence count: the denominator counts every
            # token, so the log-likelihood must cover every token as well.
            log_prob_sum += num * math.log(prob_word)
            testset_word_num += num

    # perplexity = exp(-sum_d log p(d) / sum_d N_d)
    return math.exp(-log_prob_sum / testset_word_num)


# Evaluate the trained model's perplexity on `data` using perplexity() from the perplexity_cal module.
perp = perplexity_cal.perplexity(lda_model, data, dictionary)




def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that", dictionary=None):
    """Infer the topic distribution of a piece of text with a trained LDA model.

    The text is lower-cased, split on whitespace and converted to a
    bag-of-words using the vocabulary the model was trained with. Building a
    new dictionary from the query text would assign unrelated word ids and
    make the model score the wrong words.

    :param lda_model: the trained LDA model
    :param text: raw text whose topics should be inferred
    :param dictionary: optional vocabulary providing ``doc2bow``; defaults to
        the model's own ``id2word`` mapping
    :return: the result of ``lda_model.get_document_topics`` for the text,
        a list of (topic id, probability) pairs
    """
    tokens = text.lower().split()
    vocabulary = lda_model.id2word if dictionary is None else dictionary
    bow = vocabulary.doc2bow(tokens)
    return lda_model.get_document_topics(bow)


# Infer the topics of a short sample text with the trained model.
topic = get_topic_from_model(lda_model, text="related illness that")
print(topic) # [(0, 0.08674477), (1, 0.084886044), (2, 0.8283692)] each pair in the result is (topic: probability)



import gensim  # pip install gensim
from gensim import corpora
import perplexity_cal


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Train an LDA topic model on tokenized documents.

    :param all_contents: iterable of documents, each a list of token strings
    :param dictionary: object whose ``doc2bow`` turns a token list into a
        bag-of-words; it is also handed to the model as ``id2word``
    :param num_topic: number of topics to fit
    :return: the trained ``LdaModel``
    """
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    # Core call: fit the model on the bag-of-words corpus.
    return gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)


def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that", dictionary=None):
    """Infer the topic distribution of a piece of text with a trained LDA model.

    The text is lower-cased, split on whitespace and converted to a
    bag-of-words using the vocabulary the model was trained with. Building a
    new dictionary from the query text would assign unrelated word ids and
    make the model score the wrong words.

    :param lda_model: the trained LDA model
    :param text: raw text whose topics should be inferred
    :param dictionary: optional vocabulary providing ``doc2bow``; defaults to
        the model's own ``id2word`` mapping
    :return: the result of ``lda_model.get_document_topics`` for the text,
        a list of (topic id, probability) pairs
    """
    tokens = text.lower().split()
    vocabulary = lda_model.id2word if dictionary is None else dictionary
    bow = vocabulary.doc2bow(tokens)
    return lda_model.get_document_topics(bow)


if __name__ == '__main__':
    # One document per line, tokens separated by whitespace.
    # The context manager closes the file handle once the lines are read.
    with open('data.txt') as data_file:
        data = [content.split() for content in data_file]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        # lda_model.save('lda_' + str(num_topic) + '.model')  # persist the LDA model

        # Compute the perplexity.
        perp = perplexity_cal.perplexity(lda_model, data, dictionary)
        print("LDA困惑度:  topic:", str(num_topic) + " value: " + str(perp))

        # Try the model on one piece of text.
        topic = get_topic_from_model(lda_model, text="related illness that")
        print(topic)
    except Exception as e:
        # Best-effort demo script: report the failure instead of a traceback.
        print(e)


