Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 120 additions & 1 deletion esl_mixdchlet.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@
*/
ESL_MIXDCHLET *
esl_mixdchlet_Create(int Q, int K)
{
return esl_mixdchlet_CreateForFitting(Q, K, 0);
}

ESL_MIXDCHLET *
esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric)
{
ESL_MIXDCHLET *dchl = NULL;
int status;
Expand All @@ -64,6 +70,7 @@ esl_mixdchlet_Create(int Q, int K)

dchl->Q = Q;
dchl->K = K;
dchl->isDnaStrandSymmetric = isDnaStrandSymmetric;
return dchl;

ERROR:
Expand Down Expand Up @@ -218,6 +225,25 @@ mixdchlet_pack_paramvector(ESL_MIXDCHLET *dchl, double *p)
int j = 0; /* counter in packed parameter vector <p> */
int k,a; /* indices over components, residues */

if (dchl->isDnaStrandSymmetric)
{
for (k = 0; k < dchl->Q - 1; k += 2) // paired components
p[j++] = log(dchl->q[k] * 2); // Prob(component) * 2 = Prob(pair)

if (k == dchl->Q - 1) // unpaired self-symmetric component
p[j++] = log(dchl->q[k]);

for (k = 0; k < dchl->Q - 1; k += 2) // paired components
for (a = 0; a < dchl->K; a++)
p[j++] = log(dchl->alpha[k][a]);

if (k == dchl->Q - 1) // unpaired self-symmetric component
for (a = 0; a < dchl->K / 2; a++)
p[j++] = log(dchl->alpha[k][a]);

return;
}

/* mixture coefficients */
for (k = 0; k < dchl->Q; k++)
p[j++] = log(dchl->q[k]);
Expand All @@ -240,6 +266,27 @@ mixdchlet_unpack_paramvector(double *p, ESL_MIXDCHLET *dchl)
int j = 0; /* counter in packed parameter vector <p> */
int k,a; /* indices over components, residues */

if (dchl->isDnaStrandSymmetric)
{
for (k = 0; k < dchl->Q - 1; k += 2) // paired components
dchl->q[k] = dchl->q[k+1] = exp(p[j++]) / 2;

if (k == dchl->Q - 1) // unpaired self-symmetric component
dchl->q[k] = exp(p[j++]);

esl_vec_DNorm(dchl->q, dchl->Q);

for (k = 0; k < dchl->Q - 1; k += 2) // paired components
for (a = 0; a < dchl->K; a++)
dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = exp(p[j++]);

if (k == dchl->Q - 1) // unpaired self-symmetric component
for (a = 0; a < dchl->K / 2; a++)
dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = exp(p[j++]);

return;
}

/* mixture coefficients */
for (k = 0; k < dchl->Q; k++)
dchl->q[k] = exp(p[j++]);
Expand Down Expand Up @@ -278,6 +325,7 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp)
double sum_alpha; // |alpha_k|
double sum_c; // |c_i|
double psi1; // \Psi(c_ia + alpha_ka)
double psi1b; // \Psi(c_ia' + alpha_ka)
double psi2; // \Psi( |c_i| + |alpha_k| )
double psi3; // \Psi( |alpha_k| )
double psi4; // \Psi( alpha_ka )
Expand All @@ -289,9 +337,61 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp)
{
mixdchlet_postq(dchl, data->c[i]); // d->postq[q] is now P(q | c_i, theta)
sum_c = esl_vec_DSum(data->c[i], dchl->K); // |c_i|
j = 0;

if (dchl->isDnaStrandSymmetric)
{
for (k = 0; k < dchl->Q - 1; k += 2) // paired components
dp[j++] -= dchl->postq[k] + dchl->postq[k+1] - 2 * dchl->q[k];

if (k == dchl->Q - 1) // unpaired self-symmetric component
dp[j++] -= dchl->postq[k] - dchl->q[k];

for (k = 0; k < dchl->Q - 1; k += 2) // paired components
{
double strand_weights[2] = {0, 0};
for (a = 0; a < dchl->K; a++)
{
int b = dchl->K - 1 - a; // complement of DNA base a
strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]);
strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]);
}
esl_vec_DLogNorm(strand_weights, 2);

sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K);
esl_stats_Psi( sum_alpha + sum_c, &psi2);
esl_stats_Psi( sum_alpha, &psi3);
for (a = 0; a < dchl->K; a++)
{
int b = dchl->K - 1 - a; // complement of DNA base a
esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1);
esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b);
esl_stats_Psi( dchl->alpha[k][a], &psi4);
dp[j++] -= dchl->alpha[k][a] * (dchl->postq[k] + dchl->postq[k+1])
* (strand_weights[0] * psi1 + strand_weights[1] * psi1b - psi2 + psi3 - psi4);
}
}

if (k == dchl->Q - 1) // unpaired self-symmetric component
{
sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K);
esl_stats_Psi( sum_alpha + sum_c, &psi2);
esl_stats_Psi( sum_alpha, &psi3);
for (a = 0; a < dchl->K / 2; a++)
{
int b = dchl->K - 1 - a; // complement of DNA base a
esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1);
esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b);
esl_stats_Psi( dchl->alpha[k][a], &psi4);
dp[j++] -= dchl->alpha[k][a] * dchl->postq[k]
* (psi1 + psi1b - 2 * psi2 + 2 * psi3 - 2 * psi4);
}
}

continue;
}

/* mixture coefficient gradient */
j = 0;
for (k = 0; k < dchl->Q; k++)
dp[j++] -= dchl->postq[k] - dchl->q[k];

Expand Down Expand Up @@ -351,6 +451,8 @@ esl_mixdchlet_Fit(double **c, int N, ESL_MIXDCHLET *dchl, double *opt_nll)
double fx;
int status;

if (dchl->isDnaStrandSymmetric) nparam = (nparam + 1) / 2;

cfg = esl_min_cfg_Create(nparam);
if (! cfg) { status = eslEMEM; goto ERROR; }
cfg->cg_rtol = 3e-5;
Expand Down Expand Up @@ -419,6 +521,23 @@ esl_mixdchlet_Sample(ESL_RANDOMNESS *rng, ESL_MIXDCHLET *dchl)
{
int k,a;

if (dchl->isDnaStrandSymmetric)
{
esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q);
for (k = 0; k < dchl->Q - 1; k += 2) // paired components
dchl->q[k] = dchl->q[k+1] = (dchl->q[k] + dchl->q[k+1]) / 2;

for (k = 0; k < dchl->Q - 1; k += 2) // paired components
for (a = 0; a < dchl->K; a++)
dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng);

if (k == dchl->Q - 1) // unpaired self-symmetric component
for (a = 0; a < dchl->K / 2; a++)
dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng);

return eslOK;
}

esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q);
for (k = 0; k < dchl->Q; k++)
for (a = 0; a < dchl->K; a++)
Expand Down
2 changes: 2 additions & 0 deletions esl_mixdchlet.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ typedef struct {
double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */
int Q; /* number of mixtures, e.g. 9 for Sjolander */
int K; /* alphabet size, e.g. 20 */
int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */

double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */
/*::cexcerpt::dirichlet_mixdchlet::end::*/
} ESL_MIXDCHLET;


extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K);
extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric);
extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl);

extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c);
Expand Down
54 changes: 52 additions & 2 deletions esl_mixdchlet.tex
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
\documentclass[11pt]{article}
\setcounter{secnumdepth}{0}

\usepackage{amsmath}
\usepackage{relsize}

\newcommand{\mono}[1]{{\smaller\texttt{#1}}} % literal (to be typed): code, program names
Expand Down Expand Up @@ -38,7 +39,7 @@ \section{Fitting a mixture Dirichlet to counts}
\[
P(c_i \mid \alpha_k) = \frac{ |c_i|! }
{ \prod_a c_{ia}! }
\frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) }
\frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) }
{ \Gamma ( |c_i + \alpha_k| ) }
\frac{ \Gamma ( |\alpha_k| ) }
{ \prod_a \Gamma \left( \alpha_{ka} \right) }
Expand Down Expand Up @@ -91,6 +92,55 @@ \section{Fitting a mixture Dirichlet to counts}
maximizer. The implementation provides the negative log likelihood
and the negative gradient to the CG routine.

\subsection{Fitting a mixture Dirichlet with DNA strand symmetry}

\end{document}
Here, each count vector has $K=4$ counts of DNA bases. For a
strand-symmetric mixture Dirichlet, each component must be either
self-symmetric: $\alpha_{ka} = \alpha_{ka'}$ (where $a'$ is the
complement of base $a$), or paired with a symmetric component:
$\alpha_{ka} = \alpha_{k'a'}$ and $q_k = q_{k'}$ (where $k$ and $k'$
indicate paired components). Define $R$ to be the number of pairs
plus the number of unpaired components. It's convenient to make these
definitions:
\[
r_k = \begin{cases}
2 q_k & \text{for a paired component (i.e., the prior probability of the pair),} \\
q_k & \text{for an unpaired component}
\end{cases}
\]
\[
\hat{P}(k \mid \theta, c_i) = \begin{cases}
P(k \mid \theta, c_i) + P(k' \mid \theta, c_i) & \text{for a paired component,}\\
P(k \mid \theta, c_i) & \text{for an unpaired component}
\end{cases}
\]
Here, the number of $\lambda$ parameters is $R$, and the number of
$\beta$ parameters is: $K \times \text{(number of components) / 2}$.
We can define $\beta_{ka}$ as above, and $\lambda_k$ like this:
\begin{eqnarray*}
r_k & = & \frac{ e^{\lambda_k} } { \sum_{m=1}^R e^{\lambda_m} } \,.
\end{eqnarray*}
Then,
\begin{eqnarray*}
\frac{\partial L}{\partial \lambda_k} &=& \sum_i \bigl( \hat{P}(k \mid \theta, c_i) - r_k \bigr) \\
\frac{\partial L}{\partial \beta_{ka}} &=&
\begin{cases}
\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i)
(X_{ika} + Z_{ika}) & \text{for a paired component,} \\
\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i)
(Y_{ika} + 2 Z_{ika}) & \text{for an unpaired component}
\end{cases}
\end{eqnarray*}
where
\begin{eqnarray*}
X_{ika} & = &
\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) +
\Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) }
{ \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } \,,\\
Y_{ika} & = & \Psi \left( c_{ia} + \alpha_{ka} \right) + \Psi \left( c_{ia'} + \alpha_{ka} \right) \,,\\
Z_{ika} & = & - \Psi \left( | c_i | + | \alpha_k | \right)
+ \Psi \left( | \alpha_k | \right)
- \Psi \left( \alpha_{ka} \right) \,.
\end{eqnarray*}

\end{document}
5 changes: 5 additions & 0 deletions miniapps/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ LDFLAGS = @LDFLAGS@
DEFS = @DEFS@
LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm

PROGS = esl-mixdchlet

## list of the miniapp commands to compile.
#
SUBCMDOBJS = \
Expand Down Expand Up @@ -97,6 +99,9 @@ check: ${PROGS} easel
easel: % : %.c ../libeasel.a ${SUBCMDOBJS}
${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS}

${PROGS}: % : %.c ../libeasel.a
${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS}

${SUBCMDOBJS}: %.o : %.c ../libeasel.a
${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $<

Expand Down
4 changes: 3 additions & 1 deletion miniapps/esl-mixdchlet.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ static ESL_OPTIONS fit_options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "fit a DNA strand-symmetric mixture to A C G T counts", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};

Expand All @@ -119,13 +120,14 @@ cmd_fit(const char *topcmd, const ESL_SUBCMD *sub, int argc, char **argv)
{
ESL_GETOPTS *go = esl_subcmd_CreateDefaultApp(topcmd, sub, fit_options, argc, argv, /*custom opthelp?:*/NULL);
ESL_RANDOMNESS *rng = esl_randomness_Create( esl_opt_GetInteger(go, "-s"));
int isDna = esl_opt_GetBoolean(go, "-d"); // a complement-symmetric mixture Dirichlet for DNA base counts?
int Q = strtol(esl_opt_GetArg(go, 1), NULL, 10); // number of mixture components
int K = strtol(esl_opt_GetArg(go, 2), NULL, 10); // size of probability/parameter vectors - length of count vectors
char *ctfile = esl_opt_GetArg(go, 3); // count file to input
char *outfile = esl_opt_GetArg(go, 4); // mixture Dirichlet file output
ESL_FILEPARSER *efp = NULL; // open fileparser for reading
FILE *ofp = NULL; // open output file for writing
ESL_MIXDCHLET *dchl = esl_mixdchlet_Create(Q,K); // mixture Dirichlet being estimated
ESL_MIXDCHLET *dchl = esl_mixdchlet_CreateForFitting(Q,K,isDna); // mixture Dirichlet being estimated
int Nalloc = 1024; // initial allocation for ct[]
double **ct = esl_mat_DCreate(Nalloc, K); // count vectors, [0..N-1][0..K-1]
int N = 0; // number of count vectors so far
Expand Down