Anyone interested in LDA has probably read the paper "Latent Dirichlet Allocation" by David M. Blei, Andrew Y. Ng, and Michael I. Jordan. Besides surveying several familiar latent-variable models (the unigram model, the mixture of unigrams, and probabilistic latent semantic indexing), the paper concentrates on the derivation of the LDA algorithm and the flow of its implementation.
The purpose of this post is to walk through how LDA is implemented in code, so that reading the paper alone does not leave everything feeling abstract. Figure 1 is the familiar LDA graphical model; for a detailed introduction see the previous post "LDA学习摘要".
Figure 1: the LDA graphical model (original image not available)
Fitting an LDA model means estimating the parameters alpha and beta. The objective function can be written as follows, where z is the hidden topic variable. Unfortunately, this objective is very hard to maximize directly, so another route is needed, and that route is convexity-based variational inference.
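Since the original image is unavailable, here is the objective as reconstructed from the paper: for a corpus of M documents, the log likelihood to be maximized is

\ell(\alpha, \beta) = \sum_{d=1}^{M} \log p(\mathbf{w}_d \mid \alpha, \beta) = \sum_{d=1}^{M} \log \int p(\theta_d \mid \alpha) \prod_{n=1}^{N_d} \sum_{z_{dn}} p(z_{dn} \mid \theta_d)\, p(w_{dn} \mid z_{dn}, \beta)\, d\theta_d

The coupling of \theta and \beta inside the integral is what makes direct maximization infeasible.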
After introducing the variational parameters gamma and phi, the LDA graphical model simplifies to Figure 2, and gamma and phi can be obtained by solving the following optimization problem.
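The image showing that optimization is also unavailable; reconstructed from the paper, it is the minimization of the KL divergence between the variational distribution and the true posterior,

(\gamma^{*}, \phi^{*}) = \arg\min_{(\gamma, \phi)} \mathrm{D}\big( q(\theta, \mathbf{z} \mid \gamma, \phi) \,\|\, p(\theta, \mathbf{z} \mid \mathbf{w}, \alpha, \beta) \big)

which is equivalent to maximizing the variational lower bound on \log p(\mathbf{w} \mid \alpha, \beta).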
Finally, the derivation (see the appendix of the paper) yields the following update relations, where n indexes the words of a document and i indexes the topics.
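The image with the update relations is missing; reconstructed from the paper, they are

\phi_{ni} \propto \beta_{i w_n} \exp\big( \Psi(\gamma_i) \big), \qquad \gamma_i = \alpha_i + \sum_{n=1}^{N} \phi_{ni}

where \Psi is the digamma function and w_n is the n-th word of the document.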
Figure 2: the LDA variational graphical model (original image not available)
Based on the above, we arrive at the variational inference procedure for gamma and phi shown in Figure 3.
Figure 3: solving for gamma and phi (original image not available)
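Since the image is not available, here is the procedure Figure 3 presumably showed, i.e. the variational inference algorithm from the paper:

1. Initialize \phi_{ni} := 1/k for all n and i, and \gamma_i := \alpha_i + N/k for all i.
2. Repeat until the lower bound converges: for each word n, recompute \phi_{ni} \propto \beta_{i w_n} \exp(\Psi(\gamma_i)) and normalize over the topics i; then set \gamma_i := \alpha_i + \sum_n \phi_{ni}.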
At this point one might ask: the outputs of the LDA model are alpha and beta, so what is the use of a procedure for computing gamma and phi?
A careful reader will notice that the updates for gamma and phi in Figure 3 depend on alpha and beta, while the derivation in the paper's appendix gives the update formulas for beta and alpha shown in Figures 4 and 5, and those in turn depend on gamma and phi.
Figure 4: solving for beta (original image not available)
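The image is missing; reconstructed from the paper, the M-step update for beta is

\beta_{ij} \propto \sum_{d=1}^{M} \sum_{n=1}^{N_d} \phi^{*}_{dni}\, w_{dn}^{j}

that is, each topic-word entry is the phi-weighted count of word j under topic i, normalized over j. In the code below this is exactly ss->class_word[k][w] divided by ss->class_total[k], stored in log space.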
Figure 5: solving for alpha (original image not available)
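The image is missing. Alpha has no closed-form update; reconstructed from the paper's appendix, the bound is maximized by Newton-Raphson using the gradient

\frac{\partial L}{\partial \alpha_i} = M\Big( \Psi\big( \sum_{j} \alpha_j \big) - \Psi(\alpha_i) \Big) + \sum_{d=1}^{M} \Big( \Psi(\gamma_{di}) - \Psi\big( \sum_{j} \gamma_{dj} \big) \Big)

together with a Hessian of the special form \mathrm{diag}(h) + \mathbf{1} z \mathbf{1}^{\top}, which keeps each Newton step linear in the number of topics. The code below simplifies this further to a single symmetric alpha (model->alpha is a scalar).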
Looking at Figures 3, 4 and 5, the E-M idea suggests itself: the E-step solves for gamma and phi, the M-step solves for alpha and beta, and iterating the two yields the final LDA model.
With that said, let's look at the actual implementation of LDA and shake off the "abstract" feeling for good (the code is easy to follow if read against Figures 3, 4 and 5).
run_em(random, out_dir, corpus)

void run_em(char* start, char* directory, corpus* corpus)
{
    // gamma: one row per document, one column per topic (num_docs x NTOPICS)
    var_gamma = malloc(sizeof(double*) * (corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    // phi: one row per word position of the longest document (max_length x NTOPICS)
    int max_length = max_corpus_length(corpus); // word count of the longest document
    phi = malloc(sizeof(double*) * max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize the model (random start)
    model = new_lda_model(corpus->num_terms, NTOPICS);
lda_model* new_lda_model(int num_terms, int num_topics)
{
    model = malloc(sizeof(lda_model));
    model->num_topics = num_topics;
    model->num_terms = num_terms;
    model->alpha = 1.0;
    // num_topics x num_terms matrix: log probability of each word under each topic
    model->log_prob_w = malloc(sizeof(double*) * num_topics);
    for (i = 0; i < num_topics; i++)
    {
        model->log_prob_w[i] = malloc(sizeof(double) * num_terms);
        for (j = 0; j < num_terms; j++)
            model->log_prob_w[i][j] = 0;
    }
}
    ss = new_lda_suffstats(model); // allocate the sufficient statistics

lda_suffstats* new_lda_suffstats(lda_model* model)
{
    ss->class_total = malloc(sizeof(double) * num_topics); // per-topic totals of the word counts
    ss->class_word = malloc(sizeof(double*) * num_topics); // topic-word count matrix
    for (i = 0; i < num_topics; i++)
    {
        ss->class_total[i] = 0;
        ss->class_word[i] = malloc(sizeof(double) * num_terms);
        for (j = 0; j < num_terms; j++)
            ss->class_word[i][j] = 0;
    }
}
    random_initialize_ss(ss, model);

void random_initialize_ss(lda_suffstats* ss, lda_model* model)
{
    for (k = 0; k < num_topics; k++)
        for (n = 0; n < num_terms; n++)
        {
            // roughly uniform mass per word plus a little random noise
            ss->class_word[k][n] += 1.0/num_terms + myrand();
            ss->class_total[k] += ss->class_word[k][n];
        }
}
    lda_mle(model, ss, 0); // compute the MLE lda model from the sufficient statistics

void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
    // for every topic k and word w, normalize the counts (kept in log space)
    if (ss->class_word[k][w] > 0)
        model->log_prob_w[k][w] = log(ss->class_word[k][w]) - log(ss->class_total[k]);
    else
        model->log_prob_w[k][w] = -100;
}

    model->alpha = INITIAL_ALPHA;
    save_lda_model(model, filename);

void save_lda_model(lda_model* model, char* model_root)
{
    // <model_root>.beta: the topic-word log probabilities
    sprintf(filename, "%s.beta", model_root);
    fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]); // log probability of word j under topic i
    // <model_root>.other: the remaining scalars
    sprintf(filename, "%s.other", model_root);
    fprintf(fileptr, "num_topics %d\n", model->num_topics);
    fprintf(fileptr, "num_terms %d\n", model->num_terms);
    fprintf(fileptr, "alpha %5.10f\n", model->alpha);
}
    // run expectation maximization
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER))
    {
        zero_initialize_ss(ss, model);

void zero_initialize_ss(lda_suffstats* ss, lda_model* model)
{
    // reset the sufficient statistics before accumulating over the corpus again
    ss->class_total[k] = 0;   // for every topic k
    ss->class_word[k][w] = 0; // for every topic k and word w
    ss->num_docs = 0;
    ss->alpha_suffstats = 0;
}
        // e-step
        for (d = 0; d < corpus->num_docs; d++)
        {
            // perform inference on one document and update the sufficient statistics
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        } // end of the loop over documents

double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss)
{
    // posterior (variational) inference for this document
    likelihood = lda_inference(doc, model, gamma, phi);
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
    // initialize the posterior dirichlet: gamma_k = alpha + N/k, phi uniform
    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total / ((double) model->num_topics));
        // digamma: taylor approximation of the first derivative of the log gamma function
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++) // loop over the words of the document
        {
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                // phi update: digamma(gamma_k) + log beta_{k, w_n}
                phi[n][k] = digamma_gam[k] + model->log_prob_w[k][doc->words[n]];
                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }
            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum); // normalize phi over the topics
                // gamma update: add the change in the expected count for topic k
                var_gamma[k] = var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }
        // compute the likelihood bound and test for convergence
        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;
    }
    return(likelihood);
} // end of lda_inference
    // update the sufficient statistics (still inside doc_e_step)
    for (k = 0; k < model->num_topics; k++)
    {
        gamma_sum += gamma[k];
        // alpha_suffstats accumulates sum_i (digamma(gamma_i) - digamma(sum_j gamma_j))
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    for (n = 0; n < doc->length; n++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            // expected counts: how much of word w_n this document assigns to topic k
            ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
            ss->class_total[k] += doc->counts[n]*phi[n][k];
        }
    }
    ss->num_docs = ss->num_docs + 1;

    return(likelihood);
} // end of doc_e_step (the e-step)
        // m-step
        lda_mle(model, ss, ESTIMATE_ALPHA);

void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
    // beta update: normalize the expected topic-word counts (kept in log space)
    for (k = 0; k < model->num_topics; k++)
    {
        for (w = 0; w < model->num_terms; w++)
        {
            if (ss->class_word[k][w] > 0)
                model->log_prob_w[k][w] = log(ss->class_word[k][w]) - log(ss->class_total[k]);
            else
                model->log_prob_w[k][w] = -100; // effectively zero probability
        }
    }
    // alpha update: one-dimensional Newton's method on the variational bound
    model->alpha = opt_alpha(ss->alpha_suffstats, ss->num_docs, model->num_topics);
} // end of lda_mle (the m-step)

    } // end of the EM loop
} // end of run_em
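The walkthrough stops at opt_alpha. As a closing aside, here is a minimal sketch of the idea behind it: fit a single symmetric alpha by running Newton's method on log(alpha) against the alpha-dependent part of the variational bound, using the alpha_suffstats accumulated in the e-step. This is only a sketch of the technique, not the actual source of opt_alpha; the digamma/trigamma helpers are replaced by finite differences of lgamma so that the snippet is self-contained.

#include <math.h>

/* finite-difference stand-ins for the digamma/trigamma helpers
   (the real code uses series approximations) */
static double digamma_fd(double x)  { double h = 1e-5; return (lgamma(x + h) - lgamma(x - h)) / (2 * h); }
static double trigamma_fd(double x) { double h = 1e-4; return (lgamma(x + h) - 2*lgamma(x) + lgamma(x - h)) / (h * h); }

/* derivatives of the alpha-dependent part of the bound, for D documents and K topics:
     L(a) = D*(lgamma(K*a) - K*lgamma(a)) + (a - 1)*ss,
   where ss = sum_d sum_i (digamma(gamma_di) - digamma(sum_j gamma_dj)) */
static double d_alhood(double a, double ss, int D, int K)
{ return D * (K * digamma_fd(K * a) - K * digamma_fd(a)) + ss; }

static double d2_alhood(double a, int D, int K)
{ return D * (K * K * trigamma_fd(K * a) - K * trigamma_fd(a)); }

/* Newton's method on log(alpha), so that alpha stays positive */
double opt_alpha_sketch(double ss, int D, int K)
{
    double log_a = log(100.0); /* generous starting point */
    for (int iter = 0; iter < 1000; iter++)
    {
        double a   = exp(log_a);
        double df  = d_alhood(a, ss, D, K);
        double d2f = d2_alhood(a, D, K);
        log_a = log_a - df / (d2f * a + df); /* Newton step, with the chain rule for the log reparameterization */
        if (fabs(df) < 1e-5) break;
    }
    return exp(log_a);
}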