this is for holding javascript data
John Blischak Create example analysis so that tutorial is more realistic.
over 8 years ago
Commit id: 45f891b666e33edf1b1c6e61f55a3cf2240d536c
deletions | additions
diff --git a/code/analyze.R b/code/analyze.R
new file mode 100644
index 0000000..616f108
--- /dev/null
+++ b/code/analyze.R
...
#!/usr/bin/env Rscript
# Analyze peak size distribution across chromosomes.
data_dir <- "../data"
sites <- read.table(file.path(data_dir, "sites-union.bed"),
stringsAsFactors = FALSE)
colnames(sites) <- c("chr", "start", "end")
sites$chr <- factor(sites$chr, levels = paste0("chr", c(1:22, "X")))
sites$len <- sites$end - sites$start
sites_per_chr <- table(sites$chr)
pdf(file.path(data_dir, "sites-union.pdf"))
par(mfrow = c(3, 1))
hist(sites$len)
boxplot(len ~ chr, data = sites, las = 3)
barplot(sites_per_chr, las = 3)
dev.off()
diff --git a/code/clean.py b/code/clean.py
new file mode 100755
index 0000000..7b8c058
--- /dev/null
+++ b/code/clean.py
...
#!/usr/bin/env python
"""
Identify high-quality CTCF binding sites.
1. Remove peaks with low fold change in enrichment over control.
2. Keep only those regions bound by CTCF in all three tissue types.
"""
import pybedtools
import math
data = "../data"
# For details of MACS2 output format:
# https://github.com/taoliu/MACS#output-files
epithelial = pybedtools.BedTool(data + "/epithelial/epithelial_peaks.narrowPeak")
proximal_tube = pybedtools.BedTool(data + "/proximal-tube/proximal-tube_peaks.narrowPeak")
kidney = pybedtools.BedTool(data + "/kidney/kidney_peaks.narrowPeak")
def filter_fold_change(feature, fc = 1):
"""
Removes peaks with a fold change over the control less than fc.
"""
if float(feature[7]) >= fc:
return True
else:
return False
# Filter based on fold-change over control sample
fc_cutoff = 10
epithelial = epithelial.filter(filter_fold_change, fc = fc_cutoff).saveas()
proximal_tube = proximal_tube.filter(filter_fold_change, fc = fc_cutoff).saveas()
kidney = kidney.filter(filter_fold_change, fc = fc_cutoff).saveas()
# Identify only those sites that are peaks in all three tissue types
combined = pybedtools.BedTool().multi_intersect(i = [epithelial.fn, proximal_tube.fn, kidney.fn])
union = combined.filter(lambda x: int(x[3]) == 3).saveas()
union.cut(range(3)).saveas(data + "/sites-union.bed")
diff --git a/code/process.sh b/code/process.sh
new file mode 100755
index 0000000..71b00b9
--- /dev/null
+++ b/code/process.sh
...
#!/bin/bash
# Download bam files and call peaks.
ENCODE=https://www.encodeproject.org/files
OUTDIR=../data
MACS2="macs2 callpeak -f BAM -g hs"
################################################################################
echo "Processing kidney epithelial cell data"
# Kidney epithelial cells
# https://www.encodeproject.org/experiments/ENCSR000DVH/
# Rep 1 - ENCFF001HQY
# Rep 2 - ENCFF001HRF
# Control - ENCFF001HRJ
# Setup
SUBDIR=$OUTDIR/epithelial
mkdir -p $SUBDIR
# Download
# $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
if [[ ! -e $SUBDIR/ENCFF001HQY.bam ]]
then
echo "Downloading data"
for SAMPLE in ENCFF001HQY ENCFF001HRF ENCFF001HRJ
do
wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
done
fi
# Run MACS2
echo "Running MACS2"
$MACS2 -t $SUBDIR/ENCFF001HQY.bam $SUBDIR/ENCFF001HRF.bam -c $SUBDIR/ENCFF001HRJ.bam \
-n epithelial --outdir $SUBDIR
################################################################################
echo "Processing proximal tube data"
# Epithelial cell of proximal tube
# https://www.encodeproject.org/experiments/ENCSR000DXD/
# Rep 1 - ENCFF001HWB
# Rep 2 - ENCFF001HWC
# Control - ENCFF001HWN
# Setup
SUBDIR=$OUTDIR/proximal-tube
mkdir -p $SUBDIR
# Download
# $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
if [[ ! -e $SUBDIR/ENCFF001HWB.bam ]]
then
echo "Downloading data"
for SAMPLE in ENCFF001HWB ENCFF001HWC ENCFF001HWN
do
wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
done
fi
# Run MACS2
echo "Running MACS2"
$MACS2 -t $SUBDIR/ENCFF001HWB.bam $SUBDIR/ENCFF001HWC.bam -c $SUBDIR/ENCFF001HWN.bam \
-n proximal-tube --outdir $SUBDIR
################################################################################
echo "Processing kidney data"
# kidney
# https://www.encodeproject.org/experiments/ENCSR000DMC/
# Rep 1 - ENCFF000RWX
# Rep 2 - ENCFF000RWW
# Rep 3 - ENCFF000RWZ
# Control - ENCFF000RXD
# Setup
SUBDIR=$OUTDIR/kidney
mkdir -p $SUBDIR
# Download
# $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
if [[ ! -e $SUBDIR/ENCFF000RWX.bam ]]
then
echo "Downloading data"
for SAMPLE in ENCFF000RWX ENCFF000RWW ENCFF000RWZ ENCFF000RXD
do
wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam
done
fi
# Run MACS2
echo "Running MACS2"
$MACS2 -t $SUBDIR/ENCFF000RWX.bam $SUBDIR/ENCFF000RWW.bam $SUBDIR/ENCFF000RWZ.bam \
-c $SUBDIR/ENCFF000RXD.bam \
-n kidney --outdir $SUBDIR