this is for holding javascript data
Steven Roberts added post
about 9 years ago
Commit id: 930755e1cb74fc2b12f80c8f706de6d92bade5c5
deletions | additions
diff --git a/ipynb/Array-feature-overlap-02.ipynb b/ipynb/Array-feature-overlap-02.ipynb
new file mode 100644
index 0000000..223ff3d
--- /dev/null
+++ b/ipynb/Array-feature-overlap-02.ipynb
...
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Re-defining canonical C gigas Genome Tracks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"via Ensembl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scaffold44098\tdust\trepeat_region\t518076\t518099\t.\t.\t.\tName=dust;class=dust;type=Dust\r\n",
"scaffold44098\tdust\trepeat_region\t519261\t519281\t.\t.\t.\tName=dust;class=dust;type=Dust\r\n",
"scaffold44098\ttrf\trepeat_region\t519261\t519281\t.\t.\t.\tName=trf;class=trf;repeat_consensus=AT;type=Tandem repeats\r\n"
]
}
],
"source": [
"!tail -3 /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"186890 CDS\r\n",
" 5 RNA\r\n",
"189468 exon\r\n",
"26114 gene\r\n",
" 28 miRNA\r\n",
" 28 miRNA_gene\r\n",
"1410 pseudogenic_tRNA\r\n",
" 13 rRNA\r\n",
" 13 rRNA_gene\r\n",
"875275 repeat_region\r\n",
" 47 snRNA\r\n",
" 47 snRNA_gene\r\n",
" 20 snoRNA\r\n",
" 20 snoRNA_gene\r\n",
" 994 tRNA_gene\r\n",
"28523 transcript\r\n"
]
}
],
"source": [
"!cut -f 3 \\\n",
"/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 5 EnsemblGenomes\tRNA\r\n",
"2530 EnsemblGenomes\texon\r\n",
" 13 EnsemblGenomes\tgene\r\n",
" 28 EnsemblGenomes\tmiRNA\r\n",
" 28 EnsemblGenomes\tmiRNA_gene\r\n",
"1410 EnsemblGenomes\tpseudogenic_tRNA\r\n",
" 13 EnsemblGenomes\trRNA\r\n",
" 13 EnsemblGenomes\trRNA_gene\r\n",
" 47 EnsemblGenomes\tsnRNA\r\n",
" 47 EnsemblGenomes\tsnRNA_gene\r\n",
" 20 EnsemblGenomes\tsnoRNA\r\n",
" 20 EnsemblGenomes\tsnoRNA_gene\r\n",
" 994 EnsemblGenomes\ttRNA_gene\r\n",
"2422 EnsemblGenomes\ttranscript\r\n",
"186890 GigaDB\tCDS\r\n",
"186938 GigaDB\texon\r\n",
"26101 GigaDB\tgene\r\n",
"26101 GigaDB\ttranscript\r\n",
"650376 dust\trepeat_region\r\n",
"224899 trf\trepeat_region\r\n"
]
}
],
"source": [
"!cut -f 2,3 \\\n",
"/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scaffold44098\tprotein_coding\tCDS\t509746\t510288\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"2\"; protein_id \"EKC17988\";\r\n",
"scaffold44098\tprotein_coding\texon\t514550\t514690\t.\t-\t.\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\"; seqedit \"false\";\r\n",
"scaffold44098\tprotein_coding\tCDS\t514550\t514690\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\"; protein_id \"EKC17988\";\r\n",
"scaffold44098\tprotein_coding\tstart_codon\t514688\t514690\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\";\r\n",
"scaffold44098\tprotein_coding\texon\t514859\t515511\t.\t-\t.\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\"; seqedit \"false\";\r\n",
"scaffold44098\tprotein_coding\tstop_codon\t514859\t514861\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\";\r\n",
"scaffold44098\tprotein_coding\tCDS\t514862\t515511\t.\t-\t2\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\"; protein_id \"EKC17989\";\r\n",
"scaffold44098\tprotein_coding\texon\t515871\t515877\t.\t-\t.\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\"; seqedit \"false\";\r\n",
"scaffold44098\tprotein_coding\tCDS\t515871\t515877\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\"; protein_id \"EKC17989\";\r\n",
"scaffold44098\tprotein_coding\tstart_codon\t515875\t515877\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\";\r\n"
]
}
],
"source": [
"!tail /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gtf"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2 RNase_MRP_RNA\texon\r\n",
" 1 RNase_P_RNA\texon\r\n",
" 10 SRP_RNA\texon\r\n",
" 28 miRNA\texon\r\n",
" 5 misc_RNA\texon\r\n",
" 48 nontranslating_CDS\texon\r\n",
"186890 protein_coding\tCDS\r\n",
"186890 protein_coding\texon\r\n",
"25587 protein_coding\tstart_codon\r\n",
"26087 protein_coding\tstop_codon\r\n",
" 13 rRNA\texon\r\n",
" 47 snRNA\texon\r\n",
" 20 snoRNA\texon\r\n",
" 994 tRNA\texon\r\n",
"1410 tRNA_pseudogene\texon\r\n"
]
}
],
"source": [
"!cut -f 2,3 \\\n",
"/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gtf \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# via GigaDB aka version9"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff <==\r\n",
"scaffold999\tflankbed\tpromoter\t99703\t100702\t.\t-\t.\tID=CGI_10006972;\r",
"\r\n",
"scaffold999\tflankbed\tpromoter\t106744\t107743\t.\t+\t.\tID=CGI_10006973;\r",
"\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_CG.gff <==\r\n",
"scaffold38980\tfuzznuc\tnucleotide_motif\t63903\t63904\t2\t+\t.\tID=scaffold38980.744;note=*pat pattern:CG\r\n",
"scaffold38980\tfuzznuc\tnucleotide_motif\t64051\t64052\t2\t+\t.\tID=scaffold38980.745;note=*pat pattern:CG\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-TANDEMREPEAT.gff <==\r\n",
"scaffold999\tTRF\tTandem_Repeat\t153009\t153196\t189\t+\t.\t.\r\n",
"scaffold999\tTRF\tTandem_Repeat\t166754\t166792\t69\t+\t.\t.\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff <==\r\n",
"scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n",
"scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE.gff <==\r\n",
"scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n",
"scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TEx.gff <==\r\n",
"scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n",
"scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff <==\r\n",
"scaffold22\tGLEAN\tCDS\t1870289\t1870360\t.\t-\t0\tParent=CGI_10028939;\r\n",
"scaffold22\tGLEAN\tCDS\t1869336\t1869428\t.\t-\t0\tParent=CGI_10028939;\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff <==\r\n",
"scaffold22\tGLEAN\tmRNA\t1863760\t1864161\t0.544455\t+\t.\tID=CGI_10028938;\r\n",
"scaffold22\tGLEAN\tmRNA\t1869336\t1885890\t0.999933\t-\t.\tID=CGI_10028939;\r\n",
"\r\n",
"==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_intron.gff <==\r\n",
"scaffold999\tsubtractBed\tintrn\t124997\t126011\t.\t+\t.\tParent=CGI_10006973;\r",
"\r\n",
"scaffold999\tsubtractBed\tintrn\t126144\t126616\t.\t+\t.\tParent=CGI_10006973;\r",
"\r\n"
]
}
],
"source": [
"!tail -2 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_*.gff"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 28023 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff\n",
" 10035701 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_CG.gff\n",
" 61319 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-TANDEMREPEAT.gff\n",
" 58468 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff\n",
" 119786 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE.gff\n",
" 58468 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TEx.gff\n",
" 196691 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff\n",
" 28027 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff\n",
" 176049 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_intron.gff\n",
" 10762532 total\n"
]
}
],
"source": [
"!wc -l /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_*.gff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Comparison"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Let's see if we can take all the array data and intersect it with the Ensembl gff"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1373 GigaDB\tCDS\r\n",
"1373 GigaDB\texon\r\n",
"8468 GigaDB\tgene\r\n",
"8468 GigaDB\ttranscript\r\n",
"1240 dust\trepeat_region\r\n",
" 975 trf\trepeat_region\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2 EnsemblGenomes\texon\r\n",
" 1 EnsemblGenomes\tpseudogenic_tRNA\r\n",
" 1 EnsemblGenomes\ttRNA_gene\r\n",
" 2 EnsemblGenomes\ttranscript\r\n",
"1177 GigaDB\tCDS\r\n",
"1177 GigaDB\texon\r\n",
"8491 GigaDB\tgene\r\n",
"8491 GigaDB\ttranscript\r\n",
"1320 dust\trepeat_region\r\n",
" 873 trf\trepeat_region\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1 EnsemblGenomes\texon\r\n",
" 1 EnsemblGenomes\tsnRNA\r\n",
" 1 EnsemblGenomes\tsnRNA_gene\r\n",
" 947 GigaDB\tCDS\r\n",
" 948 GigaDB\texon\r\n",
"9689 GigaDB\tgene\r\n",
"9689 GigaDB\ttranscript\r\n",
"1591 dust\trepeat_region\r\n",
" 864 trf\trepeat_region\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 5 EnsemblGenomes\tRNA\r\n",
" 444 EnsemblGenomes\texon\r\n",
" 6 EnsemblGenomes\tgene\r\n",
" 2 EnsemblGenomes\tmiRNA\r\n",
" 2 EnsemblGenomes\tmiRNA_gene\r\n",
" 259 EnsemblGenomes\tpseudogenic_tRNA\r\n",
" 14 EnsemblGenomes\tsnRNA\r\n",
" 14 EnsemblGenomes\tsnRNA_gene\r\n",
" 6 EnsemblGenomes\tsnoRNA\r\n",
" 6 EnsemblGenomes\tsnoRNA_gene\r\n",
" 152 EnsemblGenomes\ttRNA_gene\r\n",
" 422 EnsemblGenomes\ttranscript\r\n",
"157279 GigaDB\tCDS\r\n",
"157307 GigaDB\texon\r\n",
"600445 GigaDB\tgene\r\n",
"600445 GigaDB\ttranscript\r\n",
"56210 dust\trepeat_region\r\n",
"42390 trf\trepeat_region\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n",
"| cut -f 11,12 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TEs"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 383 WUBlastX\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n",
"| cut -f 6 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 254 WUBlastX\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n",
"| cut -f 6 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 168 WUBlastX\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n",
"| cut -f 6 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10322 WUBlastX\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n",
"| cut -f 11 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Promoters"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 976 flankbed\tpromoter\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 992 flankbed\tpromoter\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1248 flankbed\tpromoter\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n",
"| cut -f 6,7 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"66368 flankbed\r\n"
]
}
],
"source": [
"!intersectbed \\\n",
"-wb \\\n",
"-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n",
"-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n",
"| cut -f 11 \\\n",
"| sort | uniq -c | sed '/#/d'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"oys2\toys4\toys6\tProbes\n",
"gene\t8468\t8491\t9689\t600445\n",
"exon\t1373\t1177\t948\t157307\n",
"intron\t7095\t7314\t8741\t443138\n",
"dust repeat\t1240\t1320\t1591\t56210\n",
"trf repeat\t975\t873\t864\t42390\n",
"TE-blast\t383\t254\t168\t10322\n",
"promoter\t976\t992\t1248\t66368\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Analysis of one proportion"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"http://nbviewer.ipython.org/github/thomas-haslwanter/statsintro/blob/master/ipynb/70_compGroups.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline\n",
"import scipy.stats as stats"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ONE PROPORTION\n",
"The confidence interval for the given sample is 0.224 to 0.226\n"
]
}
],
"source": [
"# Get the data Probes exon\n",
"numTotal = 697753\n",
"numPositive = 157307\n",
"\n",
"# Calculate the confidence intervals\n",
"p = float(numPositive)/numTotal\n",
"se = sqrt(p*(1-p)/numTotal)\n",
"td = stats.t(numTotal-1)\n",
"ci = p + array([-1,1])*td.isf(0.025)*se\n",
"\n",
"# Print them\n",
"print('ONE PROPORTION')\n",
"print('The confidence interval for the given sample is {0:5.3f} to {1:5.3f}'.format(\n",
" ci[0], ci[1]))\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Chi-square test to a 2x2 table\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CHI SQUARE\n",
"The corrected chi2 value is 47.663, with p=0.000\n",
"The uncorrected chi2 value is 47.772, with p=0.000\n"
]
}
],
"source": [
"# Enter the data comparing Oyster 2; probes at intron\n",
"obs = array([[7095, 10028], [443138, 697753]])\n",
"\n",
"# Calculate the chi-square test\n",
"chi2_corrected = stats.chi2_contingency(obs, correction=True)\n",
"chi2_uncorrected = stats.chi2_contingency(obs, correction=False)\n",
"\n",
"# Print the result\n",
"print('CHI SQUARE')\n",
"print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_corrected[0], chi2_corrected[1]))\n",
"print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_uncorrected[0], chi2_uncorrected[1]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CHI SQUARE\n",
"The corrected chi2 value is 1.597, with p=0.206\n",
"The uncorrected chi2 value is 1.616, with p=0.204\n"
]
}
],
"source": [
"# Enter the data comparing Oyster 2; probes at gene\n",
"obs = array([[8468, 10028], [600445, 697753]])\n",
"\n",
"# Calculate the chi-square test\n",
"chi2_corrected = stats.chi2_contingency(obs, correction=True)\n",
"chi2_uncorrected = stats.chi2_contingency(obs, correction=False)\n",
"\n",
"# Print the result\n",
"print('CHI SQUARE')\n",
"print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_corrected[0], chi2_corrected[1]))\n",
"print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_uncorrected[0], chi2_uncorrected[1]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
diff --git a/notes/where-are-they-now.txt b/notes/where-are-they-now.txt
new file mode 100644
index 0000000..79b058f
--- /dev/null
+++ b/notes/where-are-they-now.txt
...
Where are they now?
**tldr**
![bar](http://eagle.fish.washington.edu/cnidarian/skitch/Screenshot_3_13_15__9_20_AM_1AB34D8C.png)
---
In an effort to find out where heat-stress-induced differentially methylated loci (DMLs) are in the oyster genome (to ultimately inform on function) I have been using `bedtools` to see where the DMLs lie on the genome. As this was done on an array platform I also felt I needed to take into consideration where the probes were, noting that they were not randomly distributed across the genome but rather targeted to genes.
I have determined the proportion of DMLs (n=10028, 10148, 11690) for each oyster that fall within a given genomic feature and compared that to the proportion of total probes (n=697753) that fall within each genomic feature. For example, in just looking at Oyster 2 DMLs and differentially expressed genes (DEGs) ...
```
# Oyster 2 DMLs (2M_sig.bedGraph) against the significant Cuffdiff gene track.
# -wb appends the matching -b record to each overlap, so column 6 is
# presumably the GTF source field (assumes a 4-column bedGraph — TODO confirm);
# `sort | uniq -c` then gives one overlap count per source.
!intersectbed \
-wb \
-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \
-b /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf \
| cut -f 6 \
| sort | uniq -c
# Same tally for all array probes, used as the background for comparison.
# The -b source field is presumably column 11 here because the probe file is
# a 9-column GFF — TODO confirm.
!intersectbed \
-wb \
-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \
-b /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf \
| cut -f 11 \
| sort | uniq -c
```
output:
880 Cufflinks
117460 Cufflinks
```
# Chi-square test of independence on a 2x2 table: are Oyster 2 DMLs over- or
# under-represented in the Cuffdiff DEG track relative to all array probes?
# Assumes `array` (numpy) and `stats` (scipy.stats) are already in the
# namespace, e.g. via `%pylab inline` and `import scipy.stats as stats`.
oys2_hits, oys2_total = 880, 10028          # Oyster 2 DML overlaps, total DMLs
probe_hits, probe_total = 117460, 697753    # probe overlaps, total probes

# chi2_contingency expects observed counts in mutually exclusive cells, so each
# row must be [overlapping, NOT overlapping]. The earlier layout
# [overlapping, total] counted every hit twice and skewed the expected values.
# NOTE(review): the chi2 values recorded below this snippet were produced with
# the earlier layout; re-run to refresh them.
# NOTE(review): the hit counts are overlap records from `intersectbed -wb`, and
# the DMLs are presumably a subset of the probes — confirm that these rows are
# the comparison you intend.
obs = array([[oys2_hits, oys2_total - oys2_hits],
             [probe_hits, probe_total - probe_hits]])
# Calculate the chi-square test, with and without Yates' continuity correction
chi2_corrected = stats.chi2_contingency(obs, correction=True)
chi2_uncorrected = stats.chi2_contingency(obs, correction=False)
# Print the result (index 0 is the chi2 statistic, index 1 the p-value)
print('CHI SQUARE')
print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_corrected[0], chi2_corrected[1]))
print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_uncorrected[0], chi2_uncorrected[1]))
```
output:
CHI SQUARE
The corrected chi2 value is 352.138, with p=0.000
The uncorrected chi2 value is 352.654, with p=0.000
~ [jupyter notebook](http://nbviewer.ipython.org/github/sr320/paper-Temp-stress/blob/master/ipynb/Array-feature-overlap-04.ipynb)
---
To be honest I feel like I am missing some nuance in the analysis; however, at this point I believe I will keep pushing through by seeing if the results break out based on whether the DML is hypo- or hyper-methylated. If you forgot, here is the breakdown.
Oyster | Hypo-methylated | Hyper-methylated | Hypo-3plus-merged | Hyper-3plus-merged
--- | --- | --- | --- | ---
2 | 7224 | 2803 | 108 | 4
4 | 6560 | 3587 | 48 | 10
6 | 7645 | 4044 | 53 | 9
This also sheds light on the fact that I am currently ignoring clustering (3-plus), something else to put on the list!