Ino de Bruijn Start counting kmers for chimerism detection  almost 10 years ago

Commit id: c89407e09226d4aee5d04528cd21f50517e48fef

deletions | additions      

         

{  "metadata": {  "name": "",  "signature": "sha256:77258f57f1da947062880cb2958d8967b5b7a24e6b75358781855bf4f6a4caec"  },  "nbformat": 3,  "nbformat_minor": 0,  "worksheets": [  {  "cells": [  {  "cell_type": "code",  "collapsed": false,  "input": [  "from Bio import SeqIO\n",  "\n",  "with open(\"/media/milou/glob/projects/masmvali-partdeux/reassembly-filtered-reads/Sample_1ng_even/Mircea_07102013_selected_refs.fasta\") as ffile:\n",  " record_dict = SeqIO.to_dict(SeqIO.parse(ffile, 'fasta'))"  ],  "language": "python",  "metadata": {},  "outputs": [],  "prompt_number": 8  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "!echo \"`cd /media/milou/glob/github/masm-vali/masmvali/ && git log -1`\""  ],  "language": "python",  "metadata": {},  "outputs": [  {  "output_type": "stream",  "stream": "stdout",  "text": [  "commit 5e4c182ff5f75d603319cffc11533b810840d2ed\r\n",  "Author: Ino de Bruijn \r\n",  "Date: Tue Jul 8 18:07:20 2014 +0200\r\n",  "\r\n",  " Add contig-metagenome-cover property for backward compatibility\r\n"  ]  }  ],  "prompt_number": 13  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "import sys\n",  "import os\n",  "import re\n",  "sys.path.append('/media/milou/glob/github/masm-vali/masmvali/')\n",  "import assembly\n",  "reload(assembly)\n",  "import nucmer\n",  "reload(nucmer)\n",  "import refgenome\n",  "reload(refgenome)\n",  "import validation\n",  "reload(validation)\n",  "import metassemble\n",  "reload(metassemble)\n",  "import jellyfish\n",  "reload(jellyfish)\n",  "\n",  "import utils\n",  "reload(utils)\n",  "\n",  "import warnings\n",  "import numpy as np\n",  "import matplotlib as mpl\n",  "import matplotlib.pyplot as plt"  ],  "language": "python",  "metadata": {},  "outputs": [],  "prompt_number": 27  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "jellyfish.Jellyfish(\"/media/milou/glob/projects/masmvali-partdeux/reassembly-filtered-reads/Sample_1ng_even/ref-jellyfish/Bacteroides_thetaiotaomicron_VPI-5482-19.jf\").count_kmer(\"GTCCCATGGAATCAATAGC\")"  ],  "language": "python",  "metadata": {},  "outputs": [  {  "metadata": {},  "output_type": "pyout",  "prompt_number": 26,  "text": [  "0"  ]  }  ],  "prompt_number": 26  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "jfr = jellyfish.JellyfishRef(\"/media/milou/glob/projects/masmvali-partdeux/reassembly-filtered-reads/Sample_1ng_even/Mircea_07102013_selected_refs.fasta\", \n",  " \"/media/milou/glob/projects/masmvali-partdeux/reassembly-filtered-reads/Sample_1ng_even/ref-jellyfish/\")"  ],  "language": "python",  "metadata": {},  "outputs": [],  "prompt_number": 28  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "jfr.get_refs_with_kmer_count_of_seq(record_dict[\"Acidobacterium_capsulatum_ATCC_51196\"].seq, 19)"  ],  "language": "python",  "metadata": {},  "outputs": [  {  "ename": "KeyboardInterrupt",  "evalue": "",  "output_type": "pyerr",  "traceback": [  "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",  "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mjfr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_refs_with_kmer_count_of_seq\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrecord_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Acidobacterium_capsulatum_ATCC_51196\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m19\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",  "\u001b[1;32m/media/milou/glob/github/masm-vali/masmvali/jellyfish.py\u001b[0m in \u001b[0;36mget_refs_with_kmer_count_of_seq\u001b[1;34m(self, seq, kmer_size)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_kmer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkmer_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[0mrefs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mJellyfish\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount_kmer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mrefs\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;32m/media/milou/glob/github/masm-vali/masmvali/jellyfish.py\u001b[0m in \u001b[0;36mcount_kmer\u001b[1;34m(self, seq)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mcount_kmer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mseq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[0mawk_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mawk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjellyfish\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"query\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"-s\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"/dev/fd/0\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_in\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\">seq\\n%s\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mseq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"{rv+=$2} END {print rv}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mawk_count\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstdout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;32m/home/idb/virtual-environments/py27-env/local/lib/python2.7/site-packages/sh.pyc\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 767\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 768\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 769\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mRunningCommand\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcall_args\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstdin\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 770\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 771\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;32m/home/idb/virtual-environments/py27-env/local/lib/python2.7/site-packages/sh.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, cmd, call_args, stdin, stdout, stderr)\u001b[0m\n\u001b[0;32m 328\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 329\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshould_wait\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 330\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 331\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 332\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;32m/home/idb/virtual-environments/py27-env/local/lib/python2.7/site-packages/sh.pyc\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 332\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 333\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 334\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_handle_exit_code\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 335\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 336\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;32m/home/idb/virtual-environments/py27-env/local/lib/python2.7/site-packages/sh.pyc\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1150\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexit_code\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1151\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlog\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"exit code not set, waiting on pid\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1152\u001b[1;33m \u001b[0mpid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexit_code\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwaitpid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1153\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexit_code\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_handle_exit_code\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexit_code\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1154\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",  "\u001b[1;31mKeyboardInterrupt\u001b[0m: "  ]  }  ],  "prompt_number": 30  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "a = set([\"ola\"])\n",  "import sh"  ],  "language": "python",  "metadata": {},  "outputs": [],  "prompt_number": 2  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "b = set([\"hej\",'ola'] + list(a))"  ],  "language": "python",  "metadata": {},  "outputs": [],  "prompt_number": 65  },  {  "cell_type": "code",  "collapsed": false,  "input": [  "sh.awk(sh.echo(\"-e\", \"5 10\\n5 10\"), \"{rv+=$2} END {print rv}\")"  ],  "language": "python",  "metadata": {},  "outputs": [  {  "metadata": {},  "output_type": "pyout",  "prompt_number": 6,  "text": [  "20"  ]  }  ],  "prompt_number": 6  },  {  "cell_type": "code",  "collapsed": false,  "input": [],  "language": "python",  "metadata": {},  "outputs": []  }  ],  "metadata": {}  }  ]  }