this is for holding javascript data
Wen Jenny Shi deleted file section_subsectionPr.tex
over 9 years ago
Commit id: 3a5921b41ec3473dbdfe37ed9ea6d8afb4cf105d
deletions | additions
diff --git a/section_subsectionPr.tex b/section_subsectionPr.tex
deleted file mode 100644
index 1f7b6c6..0000000
--- a/section_subsectionPr.tex
+++ /dev/null
...
\section{Supplementary materials}
\subsection{Proof of binary splitting}
In this subsection we consider the simplest case. Suppose there are only two genome positions to be clustered, $Y=[Y_1,\; Y_2]$. We show that if they share the same probability parameter, then the posterior probability that the two are assigned to the same cluster tends to one as the numbers of observations at the two sites, $m_1$ and $m_2$, grow large.
Without loss of generality, assume $c_1=c_2=1$; then the marginal posterior likelihood ratio of splitting the two over the current state, on the log scale, is the following:
\begin{eqnarray*}
LR&=&\log(\pi(c_1=1,c_2=2|Y))-\log(\pi(c_1=1,c_2=1|Y))\\
&=&\sumj\left[\log\Gamma(y_1^j+1/25)+\log\Gamma(y_2^j+1/25) \right]-\log\Gamma(m_1+1/5)-\log\Gamma(m_2+1/5)\\
&&-\sumj\log\Gamma(y_1^j+y_2^j+1/25)+\log\Gamma(m_1+m_2+1/5)
\end{eqnarray*}
Stirling's formula provides the following approximation:
$\log\Gamma(z)\approx\frac{1}{2}\log(2\pi)-\frac{1}{2}\log z+z\log z-z.$
Therefore,
\begin{eqnarray*}
&&LR\\
&\approx&\sumj\left[\frac{1}{2}\log(2\pi)-\frac{1}{2}\log(y_1^j+1/25)-\frac{1}{2}\log(y_2^j+1/25)+\frac{1}{2}\log(y_1^j+y_2^j+1/25) +(y_1^j+1/25)\log(y_1^j+1/25)\right. \\
&&\left.+(y_2^j+1/25)\log(y_2^j+1/25)-(y_1^j+y_2^j+1/25)\log(y_1^j+y_2^j+1/25)-1/25 \right]-\frac{1}{2}\log(2\pi)\\
&&+\frac{1}{2}\log(m_1+1/5)+\frac{1}{2}\log(m_2+1/5)-\frac{1}{2}\log(m_1+m_2+1/5)-(m_1+1/5)\log(m_1+1/5)\\
&&-(m_2+1/5)\log(m_2+1/5)+(m_1+m_2+1/5)\log(m_1+m_2+1/5)+1/5\\
&=&2\log(2\pi)+\sumj\left[(y_1^j-23/50)\log(y_1^j+1/25)+(y_2^j-23/50)\log(y_2^j+1/25)\right.\\
&&\left.-(y_1^j+y_2^j-23/50)\log(y_1^j+y_2^j+1/25) \right]+(m_1+m_2-3/10)\log(m_1+m_2+1/5)\\
&&-(m_1-3/10)\log(m_1+1/5)-(m_2-3/10)\log(m_2+1/5)
\end{eqnarray*}
Under the null hypothesis, $Y_1$ and $Y_2$ follow the same distribution, i.e., they share the same probability parameter. Denote the common probability parameter by $P=(p^1,\cdots,p^5)$. Then the normal approximation of the multinomial random variables is
$y_i^j\approx m_ip^j+\sqrt{m_i}z_i^j+ o_p(\sqrt{m_i}), \;\textit{for }i=1,2;\;j=1,\cdots,5,$
where $z_i^j$'s are standard normal random variables and $\sumj z_i^j=0$ for $i=1,2$.
Hence,
\begin{eqnarray*}
&&LR\\
&\approx&2\log(2\pi)+(m_1+m_2-3/10)\log(m_1+m_2+1/5)-(m_1-3/10)\log(m_1+1/5)\\
&&-(m_2-3/10)\log(m_2+1/5)+\sumj\left[(m_1p^j+\sqrt{m_1}z_1^j-23/50)\log(m_1p^j+\sqrt{m_1}z_1^j+1/25)\right.\\
&&+(m_2p^j+\sqrt{m_2}z_2^j-23/50)\log(m_2p^j+\sqrt{m_2}z_2^j+1/25)\\
&&\left.-(m_1p^j+\sqrt{m_1}z_1^j+m_2p^j+\sqrt{m_2}z_2^j-23/50)\log(m_1p^j+\sqrt{m_1}z_1^j+m_2p^j+\sqrt{m_2}z_2^j+1/25) \right]\\
&=&2\log(2\pi)+(m_1+m_2-3/10)\left[\log(m_1+m_2)+\log\left(1+\frac{1/5}{m_1+m_2}\right)\right]\\
&&-(m_1-3/10)\left[\log m_1+\log\left(1+\frac{1/5}{m_1}\right)\right]-(m_2-3/10)\left[\log m_2+\log\left(1+\frac{1/5}{m_2}\right)\right]\\
&&+\sumj\left\{(m_1p^j+\sqrt{m_1}z_1^j-23/50)\left[\log(m_1p^j)+\log\left(1+\frac{\sqrt{m_1}z_1^j+1/25}{m_1p^j}\right)\right]\right.\\
&&+(m_2p^j+\sqrt{m_2}z_2^j-23/50)\left[\log(m_2p^j)+\log\left(1+\frac{\sqrt{m_2}z_2^j+1/25}{m_2p^j}\right)\right]\\
&&-((m_1+m_2)p^j+\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j-23/50)\left[\log((m_1+m_2)p^j)\right.\\
&&\left.\left.+\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right) \right] \right\}\\
&=&2\log(2\pi)-\frac{23}{50}\sumj\log p^j+2\log\left(\frac{1}{m_1}+\frac{1}{m_2} \right)+\left(m_1+m_2-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_1+m_2}\right)\\
&&-\left(m_1-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_1}\right)-\left(m_2-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_2}\right)\\
&&+\sumj\left\{\left(m_1p^j+\sqrt{m_1}z_1^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_1}z_1^j+1/25}{m_1p^j}\right)\right.\\
&&+\left(m_2p^j+\sqrt{m_2}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_2}z_2^j+1/25}{m_2p^j}\right)\\
&&\left.-\left((m_1+m_2)p^j+\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right) \right\}
\end{eqnarray*}
Note that, in general, by L'H\^opital's rule, as $m_i\rightarrow \infty$,
$\sqrt{m_i}\log\left(1+\frac{\sqrt{m_i}z_i^j+1/25}{m_ip^j} \right)=\frac{\log\left(1+\frac{\sqrt{m_i}z_i^j+1/25}{m_ip^j} \right)}{1/\sqrt{m_i}}\longrightarrow\frac{z_i^j}{p^j}, \;\textit{for }i=1,2;\;j=1,\cdots,5.$
Under the assumption that $m_1$ and $m_2$ are increasing at the same rate, let $m_1=m$ and $m_2=cm$, for some $c>0.$ Then as $m\rightarrow \infty$,
\begin{eqnarray*}
&&(\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j)\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right)\\
&=&(z_1^j+\sqrt{c}z_2^j)\sqrt{m}\log\left(1+\frac{\sqrt{m}(z_1^j+\sqrt{c}z_2^j)+1/25}{(1+c)mp^j}\right)\\
&\longrightarrow&\frac{(z_1^j+\sqrt{c}z_2^j)^2}{(1+c)p^j}, \;\textit{for }j=1,\cdots,5.
\end{eqnarray*}
Therefore, as $m\rightarrow \infty$, the log likelihood ratio
\begin{eqnarray*}
&&LR\\
&\approx&2\log(2\pi)-\frac{23}{50}\sumj\log p^j+2\log\frac{1+c}{cm}+\left(m(1+c)-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m(1+c)}\right)\\
&&-\left(m-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m}\right)-\left(cm-\frac{3}{10}\right)\log\left(1+\frac{1/5}{cm}\right)\\
&&+\sumj\left\{\left(m p^j+\sqrt{m}z_1^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m}z_1^j+1/25}{mp^j}\right)\right.\\
&&+\left(cmp^j+\sqrt{cm}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{cm}z_2^j+1/25}{cmp^j}\right)\\
&&\left.-\left((1+c)mp^j+\sqrt{m}z_1^j+\sqrt{cm}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m}z_1^j+\sqrt{cm}z_2^j+1/25}{(1+c)mp^j}\right) \right\}\\
&\longrightarrow&-\infty
\end{eqnarray*}
Therefore, $Y_1$ and $Y_2$ have the same cluster label almost surely.
\subsection{H1N1 Ht plots}