John Blischak Create example analysis so that tutorial is more realistic.  over 8 years ago

Commit id: 45f891b666e33edf1b1c6e61f55a3cf2240d536c

deletions | additions      

         

#!/usr/bin/env Rscript  # Analyze peak size distribution across chromosomes.  data_dir <- "../data"  sites <- read.table(file.path(data_dir, "sites-union.bed"),  stringsAsFactors = FALSE)  colnames(sites) <- c("chr", "start", "end")  sites$chr <- factor(sites$chr, levels = paste0("chr", c(1:22, "X")))  sites$len <- sites$end - sites$start  sites_per_chr <- table(sites$chr)  pdf(file.path(data_dir, "sites-union.pdf"))  par(mfrow = c(3, 1))  hist(sites$len)  boxplot(len ~ chr, data = sites, las = 3)  barplot(sites_per_chr, las = 3)  dev.off()           

#!/usr/bin/env python  """  Identify high-quality CTCF binding sites.  1. Remove peaks with low fold change in enrichment over control.  2. Keep only those regions bound by CTCF in all three tissue types.  """  import pybedtools  import math  data = "../data"  # For details of MACS2 output format:  # https://github.com/taoliu/MACS#output-files  epithelial = pybedtools.BedTool(data + "/epithelial/epithelial_peaks.narrowPeak")  proximal_tube = pybedtools.BedTool(data + "/proximal-tube/proximal-tube_peaks.narrowPeak")  kidney = pybedtools.BedTool(data + "/kidney/kidney_peaks.narrowPeak")  def filter_fold_change(feature, fc = 1):  """  Removes peaks with a fold change over the control less than fc.  """  if float(feature[7]) >= fc:  return True  else:  return False  # Filter based on fold-change over control sample  fc_cutoff = 10  epithelial = epithelial.filter(filter_fold_change, fc = fc_cutoff).saveas()  proximal_tube = proximal_tube.filter(filter_fold_change, fc = fc_cutoff).saveas()  kidney = kidney.filter(filter_fold_change, fc = fc_cutoff).saveas()  # Identify only those sites that are peaks in all three tissue types  combined = pybedtools.BedTool().multi_intersect(i = [epithelial.fn, proximal_tube.fn, kidney.fn])  union = combined.filter(lambda x: int(x[3]) == 3).saveas()  union.cut(range(3)).saveas(data + "/sites-union.bed")           

#!/bin/bash  # Download bam files and call peaks.  ENCODE=https://www.encodeproject.org/files  OUTDIR=../data  MACS2="macs2 callpeak -f BAM -g hs"  ################################################################################  echo "Processing kidney epithelial cell data"  # Kidney epithelial cells  # https://www.encodeproject.org/experiments/ENCSR000DVH/  # Rep 1 - ENCFF001HQY  # Rep 2 - ENCFF001HRF  # Control - ENCFF001HRJ  # Setup  SUBDIR=$OUTDIR/epithelial  mkdir -p $SUBDIR  # Download  # $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  if [[ ! -e $SUBDIR/ENCFF001HQY.bam ]]  then  echo "Downloading data"  for SAMPLE in ENCFF001HQY ENCFF001HRF ENCFF001HRJ  do  wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  done  fi  # Run MACS2  echo "Running MACS2"  $MACS2 -t $SUBDIR/ENCFF001HQY.bam $SUBDIR/ENCFF001HRF.bam -c $SUBDIR/ENCFF001HRJ.bam \  -n epithelial --outdir $SUBDIR  ################################################################################  echo "Processing proximal tube data"  # Epithelial cell of proximal tube  # https://www.encodeproject.org/experiments/ENCSR000DXD/  # Rep 1 - ENCFF001HWB  # Rep 2 - ENCFF001HWC  # Control - ENCFF001HWN  # Setup  SUBDIR=$OUTDIR/proximal-tube  mkdir -p $SUBDIR  # Download  # $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  if [[ ! -e $SUBDIR/ENCFF001HWB.bam ]]  then  echo "Downloading data"  for SAMPLE in ENCFF001HWB ENCFF001HWC ENCFF001HWN  do  wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  done  fi  # Run MACS2  echo "Running MACS2"  $MACS2 -t $SUBDIR/ENCFF001HWB.bam $SUBDIR/ENCFF001HWC.bam -c $SUBDIR/ENCFF001HWN.bam \  -n proximal-tube --outdir $SUBDIR  ################################################################################  echo "Processing kidney data"  # kidney  # https://www.encodeproject.org/experiments/ENCSR000DMC/  # Rep 1 - ENCFF000RWX  # Rep 2 - ENCFF000RWW  # Rep 3 - ENCFF000RWZ  # Control - ENCFF000RXD  # Setup  SUBDIR=$OUTDIR/kidney  mkdir -p $SUBDIR  # Download  # $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  if [[ ! -e $SUBDIR/ENCFF000RWX.bam ]]  then  echo "Downloading data"  for SAMPLE in ENCFF000RWX ENCFF000RWW ENCFF000RWZ ENCFF000RXD  do  wget -O $SUBDIR/$SAMPLE.bam $ENCODE/$SAMPLE/@@download/$SAMPLE.bam  done  fi  # Run MACS2  echo "Running MACS2"  $MACS2 -t $SUBDIR/ENCFF000RWX.bam $SUBDIR/ENCFF000RWW.bam $SUBDIR/ENCFF000RWZ.bam \  -c $SUBDIR/ENCFF000RXD.bam \  -n kidney --outdir $SUBDIR