Wen Jenny Shi renamed section_subsectionPr.tex to Supplementary.tex  over 9 years ago

Commit id: 484bade8d3c9dc1075603c1bb6e1c86711fa15a4

deletions | additions      

         

\section{Supplementary materials}
\subsection{Proof of binary splitting}
In this subsection we consider the simplest case.
Suppose there are only two genome positions to be clustered, $Y=[Y_1,\; Y_2]$.
If they share the same probability parameter, then the posterior probability that the two are assigned the same cluster label tends to one as the numbers of observations at the two sites, $m_1$ and $m_2$, grow large.
Without loss of generality, assume the current state is $c_1=c_2=1$; then the marginal posterior likelihood ratio, on the log scale, of splitting the two over the current state is the following:
\begin{eqnarray*}
LR&=&\log(\pi(c_1=1,c_2=2|Y))-\log(\pi(c_1=1,c_2=1|Y))\\
&=&\sumj\left[\log\Gamma(y_1^j+1/25)+\log\Gamma(y_2^j+1/25) \right]-\log\Gamma(m_1+1/5)-\log\Gamma(m_2+1/5)\\
&&-\sumj\log\Gamma(y_1^j+y_2^j+1/25)+\log\Gamma(m_1+m_2+1/5)
\end{eqnarray*}
Stirling's formula provides the following approximation:
$\log\Gamma(z)\approx\frac{1}{2}\log(2\pi)-\frac{1}{2}\log z+z\log z-z.$
Therefore,
\begin{eqnarray*}
&&LR\\
&\approx&\sumj\left[\frac{1}{2}\log(2\pi)-\frac{1}{2}\log(y_1^j+1/25)-\frac{1}{2}\log(y_2^j+1/25)+\frac{1}{2}\log(y_1^j+y_2^j+1/25) +(y_1^j+1/25)\log(y_1^j+1/25)\right. \\
&&\left.+(y_2^j+1/25)\log(y_2^j+1/25)-(y_1^j+y_2^j+1/25)\log(y_1^j+y_2^j+1/25)-1/25 \right]-\frac{1}{2}\log(2\pi)\\
&&+\frac{1}{2}\log(m_1+1/5)+\frac{1}{2}\log(m_2+1/5)-\frac{1}{2}\log(m_1+m_2+1/5)-(m_1+1/5)\log(m_1+1/5)\\
&&-(m_2+1/5)\log(m_2+1/5)+(m_1+m_2+1/5)\log(m_1+m_2+1/5)+1/5\\
&=&2\log(2\pi)+\sumj\left[(y_1^j-23/50)\log(y_1^j+1/25)+(y_2^j-23/50)\log(y_2^j+1/25)\right.\\
&&\left.-(y_1^j+y_2^j-23/50)\log(y_1^j+y_2^j+1/25) \right]+(m_1+m_2-3/10)\log(m_1+m_2+1/5)\\
&&-(m_1-3/10)\log(m_1+1/5)-(m_2-3/10)\log(m_2+1/5)
\end{eqnarray*}
Under the null hypothesis, $Y_1$ and $Y_2$ follow the same distribution, i.e., they share the same probability parameter.
Denote the common probability parameter as $P=(p^1,\cdots,p^5)$.
Then the normal approximation of the multinomial random variables is
$y_i^j\approx m_ip^j+\sqrt{m_i}z_i^j+ \Op(\sqrt{m_i}), \;\textit{for }i=1,2;\;j=1,\cdots,5,$
where the $z_i^j$'s are standard normal random variables and $\sumj z_i^j=0$ for $i=1,2$.
Hence,
\begin{eqnarray*}
&&LR\\
&\approx&2\log(2\pi)+(m_1+m_2-3/10)\log(m_1+m_2+1/5)-(m_1-3/10)\log(m_1+1/5)\\
&&-(m_2-3/10)\log(m_2+1/5)+\sumj\left[(m_1p^j+\sqrt{m_1}z_1^j-23/50)\log(m_1p^j+\sqrt{m_1}z_1^j+1/25)\right.\\
&&+(m_2p^j+\sqrt{m_2}z_2^j-23/50)\log(m_2p^j+\sqrt{m_2}z_2^j+1/25)\\
&&\left.-(m_1p^j+\sqrt{m_1}z_1^j+m_2p^j+\sqrt{m_2}z_2^j-23/50)\log(m_1p^j+\sqrt{m_1}z_1^j+m_2p^j+\sqrt{m_2}z_2^j+1/25) \right]\\
&=&2\log(2\pi)+(m_1+m_2-3/10)\left[\log(m_1+m_2)+\log\left(1+\frac{1/5}{m_1+m_2}\right)\right]\\
&&-(m_1-3/10)\left[\log m_1+\log\left(1+\frac{1/5}{m_1}\right)\right]-(m_2-3/10)\left[\log m_2+\log\left(1+\frac{1/5}{m_2}\right)\right]\\
&&+\sumj\left\{(m_1p^j+\sqrt{m_1}z_1^j-23/50)\left[\log(m_1p^j)+\log\left(1+\frac{\sqrt{m_1}z_1^j+1/25}{m_1p^j}\right)\right]\right.\\
&&+(m_2p^j+\sqrt{m_2}z_2^j-23/50)\left[\log(m_2p^j)+\log\left(1+\frac{\sqrt{m_2}z_2^j+1/25}{m_2p^j}\right)\right]\\
&&-((m_1+m_2)p^j+\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j-23/50)\left[\log((m_1+m_2)p^j)\right.\\
&&\left.\left.+\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right) \right] \right\}\\
&=&2\log(2\pi)+2\log\left(\frac{1}{m_1}+\frac{1}{m_2} \right)+\left(m_1+m_2-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_1+m_2}\right)\\
&&-\left(m_1-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_1}\right)-\left(m_2-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m_2}\right)-\frac{23}{50}\sumj\log p^j\\
&&+\sumj\left\{\left(m_1p^j+\sqrt{m_1}z_1^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_1}z_1^j+1/25}{m_1p^j}\right)\right.\\
&&+\left(m_2p^j+\sqrt{m_2}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_2}z_2^j+1/25}{m_2p^j}\right)\\
&&\left.-\left((m_1+m_2)p^j+\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right) \right\}
\end{eqnarray*}
Note that, in general, by L'H\^opital's rule, as $m_i\rightarrow \infty$,
$\sqrt{m_i}\log\left(1+\frac{\sqrt{m_i}z_i^j+1/25}{m_ip^j} \right)=\frac{\log\left(1+\frac{\sqrt{m_i}z_i^j+1/25}{m_ip^j} \right)}{1/\sqrt{m_i}}\longrightarrow\frac{z_i^j}{p^j}, \;\textit{for }i=1,2;\;j=1,\cdots,5.$
Under the assumption that $m_1$ and $m_2$ are increasing at the same rate, let $m_1=m$ and $m_2=cm$, for some $c>0$.
Then as $m\rightarrow \infty$,
\begin{eqnarray*}
&&(\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j)\log\left(1+\frac{\sqrt{m_1}z_1^j+\sqrt{m_2}z_2^j+1/25}{(m_1+m_2)p^j}\right)\\
&=&(z_1^j+\sqrt{c}z_2^j)\sqrt{m}\log\left(1+\frac{\sqrt{m}(z_1^j+\sqrt{c}z_2^j)+1/25}{(1+c)mp^j}\right)\\
&\longrightarrow&\frac{(z_1^j+\sqrt{c}z_2^j)^2}{(1+c)p^j}, \;\textit{for }j=1,\cdots,5.
\end{eqnarray*}
Therefore, as $m\rightarrow \infty$, the log likelihood ratio satisfies
\begin{eqnarray*}
&&LR\\
&\approx&2\log(2\pi)+2\log\frac{1+c}{cm}+\left(m(1+c)-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m(1+c)}\right)\\
&&-\left(m-\frac{3}{10}\right)\log\left(1+\frac{1/5}{m}\right)-\left(cm-\frac{3}{10}\right)\log\left(1+\frac{1/5}{cm}\right)-\frac{23}{50}\sumj\log p^j\\
&&+\sumj\left\{\left(m p^j+\sqrt{m}z_1^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m}z_1^j+1/25}{mp^j}\right)\right.\\
&&+\left(cmp^j+\sqrt{cm}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{cm}z_2^j+1/25}{cmp^j}\right)\\
&&\left.-\left((1+c)mp^j+\sqrt{m}z_1^j+\sqrt{cm}z_2^j-\frac{23}{50}\right)\log\left(1+\frac{\sqrt{m}z_1^j+\sqrt{cm}z_2^j+1/25}{(1+c)mp^j}\right) \right\}\\
&\longrightarrow&-\infty
\end{eqnarray*}
because $2\log\frac{1+c}{cm}\rightarrow-\infty$ while the remaining terms stay bounded in probability (using the two limits above and $\sumj z_i^j=0$).
Therefore, $Y_1$ and $Y_2$ have the same cluster label almost surely.
\subsection{H1N1 Ht plots}