#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Latent Dirichlet Allocation + collapsed Gibbs sampling
# This code is available under the MIT License.
# (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc.
import numpy
import scipy as sp
from scipy.special import gammaln
class LDA:
    """Latent Dirichlet Allocation trained by collapsed Gibbs sampling.

    All count matrices are initialized with their Dirichlet priors folded
    in: n_m_z, n_z_t and n_z already carry alpha / beta mass, so sampling
    ratios need no extra smoothing terms.
    """

    def __init__(self, K, alpha, beta, docs, V, smartinit=True):
        """
        K -- number of topics
        alpha -- symmetric Dirichlet prior on per-document topic weights
        beta -- symmetric Dirichlet prior on per-topic word weights
        docs -- list of documents, each a list of word ids in [0, V)
        V -- vocabulary size
        smartinit -- if True, draw each initial topic assignment from the
                     current full conditional instead of uniformly at random
        """
        self.K = K
        self.alpha = alpha  # parameter of topics prior
        self.beta = beta    # parameter of words prior
        self.docs = docs
        self.V = V

        self.z_m_n = []  # topic of each word of each document
        self.n_m_z = numpy.zeros((len(self.docs), K)) + alpha  # word count of each document and topic
        self.n_z_t = numpy.zeros((K, V)) + beta  # word count of each topic and vocabulary
        self.n_z = numpy.zeros(K) + V * beta  # word count of each topic
        self.n_m_t = numpy.zeros((len(self.docs), self.V))  # word count of each document and word (no prior)

        self.N = 0  # total number of word tokens
        for m, doc in enumerate(docs):
            self.N += len(doc)
            z_n = []
            for t in doc:
                if smartinit:
                    # sample the initial topic from the current conditional;
                    # p_z is strictly positive thanks to the folded-in priors
                    p_z = self.n_z_t[:, t] * self.n_m_z[m] / self.n_z
                    z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
                else:
                    z = numpy.random.randint(0, K)
                z_n.append(z)
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1
                self.n_m_t[m, t] += 1
            self.z_m_n.append(numpy.array(z_n))

    def likelihoodOfAllWords(self):
        """
        Compute the likelihood that the model generated the data, so called p(W|T).

        Equivalent to sum_m sum_t n_m_t[m,t] * log(sum_z phi[z,t] * theta[m,z]),
        vectorized instead of the original triple Python loop.
        """
        # theta[m, z]: per-document topic proportions; phi[z, t]: per-topic word proportions
        theta = self.n_m_z / self.n_m_z.sum(axis=1)[:, numpy.newaxis]
        phi = self.n_z_t / self.n_z_t.sum(axis=1)[:, numpy.newaxis]
        # mixture probability of each word in each document; strictly positive
        # because the priors are folded into the counts, so log() is safe
        p_m_t = numpy.dot(theta, phi)
        return (self.n_m_t * numpy.log(p_m_t)).sum()

    # get this function from lda_gibbs.py
    def loglikelihood(self):
        """
        Compute the likelihood that the model generated the data.
        """
        vocab_size = self.n_z_t.shape[1]
        n_docs = self.n_m_z.shape[0]
        lik = 0
        # NOTE(review): in lda_gibbs.py the count matrices were raw, so the
        # prior was added here; in this class n_z_t/n_m_z already include
        # beta/alpha, so adding it again double-counted the prior — removed.
        for z in range(self.K):
            lik += log_multi_beta(self.n_z_t[z, :])
            lik -= log_multi_beta(self.beta, vocab_size)
        for m in range(n_docs):
            lik += log_multi_beta(self.n_m_z[m, :])
            lik -= log_multi_beta(self.alpha, self.K)
        return lik

    def inference(self):
        """learning once iteration: resample the topic of every word once."""
        for m, doc in enumerate(self.docs):
            z_n = self.z_m_n[m]
            n_m_z = self.n_m_z[m]
            for n, t in enumerate(doc):
                # discount for n-th word t with topic z
                z = z_n[n]
                n_m_z[z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1
                # sampling topic new_z for t from the full conditional
                p_z = self.n_z_t[:, t] * n_m_z / self.n_z
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
                # set z the new topic and increment counters
                z_n[n] = new_z
                n_m_z[new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1

    def worddist(self):
        """get topic-word distribution phi, shape (K, V); rows sum to 1."""
        return self.n_z_t / self.n_z[:, numpy.newaxis]

    def perplexity(self, docs=None):
        """Perplexity of docs (the training set when None) under current counts.

        NOTE(review): theta is taken from training counts indexed by position m,
        so held-out docs are assumed aligned with self.docs — confirm callers.
        """
        if docs is None:
            docs = self.docs
        phi = self.worddist()
        log_per = 0
        N = 0
        Kalpha = self.K * self.alpha
        for m, doc in enumerate(docs):
            # n_m_z[m] already includes alpha, so this normalizes to a distribution
            theta = self.n_m_z[m] / (len(self.docs[m]) + Kalpha)
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:, w], theta))
            N += len(doc)
        return numpy.exp(log_per / N)
def lda_learning(lda, iteration, voca):
    """Run `iteration` Gibbs sweeps, printing perplexity after each.

    The first time perplexity goes up, the word-topic distribution is
    dumped and tracking stops (pre_perp set to None); a final dump is
    always produced at the end.
    """
    pre_perp = lda.perplexity()
    print("initial perplexity=%f" % pre_perp)
    for i in range(iteration):
        lda.inference()
        perp = lda.perplexity()
        print("-%d p=%f" % (i + 1, perp))
        if pre_perp:
            if pre_perp < perp:
                # perplexity worsened: dump once, then stop tracking
                output_word_topic_dist(lda, voca)
                pre_perp = None
            else:
                pre_perp = perp
    output_word_topic_dist(lda, voca)
def output_word_topic_dist(lda, voca):
    """Print, for each topic, its token count and its 20 highest-probability
    words with per-topic occurrence counts.

    lda -- trained model exposing K, docs, z_m_n and worddist()
    voca -- mapping (indexable) from word id to word string
    """
    zcount = numpy.zeros(lda.K, dtype=int)  # tokens assigned to each topic
    wordcount = [dict() for k in range(lda.K)]  # per-topic {word id: count}
    for xlist, zlist in zip(lda.docs, lda.z_m_n):
        for x, z in zip(xlist, zlist):
            zcount[z] += 1
            wordcount[z][x] = wordcount[z].get(x, 0) + 1
    phi = lda.worddist()
    for k in range(lda.K):
        print("\n-- topic: %d (%d words)" % (k, zcount[k]))
        # argsort of the negated row gives word ids by descending probability
        for w in numpy.argsort(-phi[k])[:20]:
            print("%s: %f (%d)" % (voca[w], phi[k, w], wordcount[k].get(w, 0)))
def log_multi_beta(alpha, K=None):
    """
    Logarithm of the multinomial beta function.

    With K given, `alpha` is a scalar treated as a symmetric vector of
    K identical components; otherwise `alpha` is the component vector itself.
    """
    if K is not None:
        # symmetric case: K copies of the same scalar
        return K * gammaln(alpha) - gammaln(K * alpha)
    total = numpy.sum(alpha)
    return numpy.sum(gammaln(alpha)) - gammaln(total)
def main():
    """Command-line entry point: parse options, build the vocabulary,
    train an LDA model and report per-iteration perplexity."""
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))
    lda_learning(lda, options.iteration, voca)


if __name__ == "__main__":
    main()
lda.zip_ida python_ida算法 python_lda_pda_lda算法python
版权申诉
18 浏览量
2022-09-21
01:46:11
上传
评论
收藏 2KB ZIP 举报
Kinonoyomeo
- 粉丝: 74
- 资源: 1万+
最新资源
- 高性能量化工具 hikyuu 2.0.3 python3.9 ubuntu 安装包
- Cyclone Version 9.51
- 高性能量化回测工具 hikyuu 2.0.3 python 3.12 windows 安装包
- 省级城乡居民基本养老保险情况数据集(2010-2022年).xlsx
- 舞队填写版.cpp
- 基于BP神经网络的多输入单输出回归预测.zip
- 高性能量化回测工具 hikyuu 2.0.3 python 3.9 windows 安装包
- 省级城镇职工基本养老保险情况2000-2022年.xlsx
- 高性能量化回测工具 hikyuu 2.0.3 python 3.10 windows 安装包
- 算法部署-使用OpenVINO+C#部署PaddleOCR字符识别算法-项目源码-优质项目实战.zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈