Alex Rubinsteyn, "sanity checking the CV notebook" (about 8 years ago)

Commit id: d95f64a2dbb57d4b99de284bfb0024a6e2cc9bc7

deletions | additions

"cells": [  {  "cell_type": "code",  "execution_count": 1, 2,  "metadata": {  "collapsed": false  }, 

"output_type": "stream",  "text": [  "Using Theano backend.\n",  "Using gpu device 0: GeForce GTX TITAN X (CNMeM "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle  is enabled deprecated and replaced  with initial size: 75.0% axes.prop_cycle; please use the latter.\n",  " warnings.warn(self.msg_depr % (key, alt_key))\n"  ]  },  {  "name": "stdout",  "output_type": "stream",  "text": [  "Couldn't import dot_parser, loading  of memory, cuDNN 5004)\n" dot files will not be possible.\n"  ]  }  ], 

},  {  "cell_type": "code",  "execution_count": 2, 3,  "metadata": {  "collapsed": true  }, 

"source": [  "min_peptides_to_consider_allele = 50\n",  "max_ic50 = 50000\n",  "data_dir=\"/home/tim/sinai/data/mhc-binding/\"" "data_dir = \"/Users/iskander/code/mhcflurry/\""  ]  },  {  "cell_type": "code",  "execution_count": 24, 4,  "metadata": {  "collapsed": false  }, 

" [137654 rows x 8 columns], 'sequence')"  ]  },  "execution_count": 24, 4,  "metadata": {},  "output_type": "execute_result"  } 

},  {  "cell_type": "code",  "execution_count": 25, 5,  "metadata": {  "collapsed": true false  },  "outputs": [],  "source": [ 

},  {  "cell_type": "code",  "execution_count": 62, 6,  "metadata": {  "collapsed": false  }, 

"source": [  "alleles = [\n",  " \"HLA-A0201\",\n",  " #  \"HLA-A0301\",\n", " #  \"HLA-A0203\",\n", " #  \"HLA-A2602\",\n", " #  \"HLA-A2603\",\n", " #  'HLA-B7301',\n", "]\n",  "#alleles = alleles[:1] + alleles[-1:]\n",  "#alleles = [allele for allele in all_train_data if len(all_train_data[allele].Y) >= min_peptides_to_consider_allele]" 

},  {  "cell_type": "code",  "execution_count": 28, 7,  "metadata": {  "collapsed": false  }, 

{  "data": {  "text/plain": [  "0.29094172040394206" "0.36749263596306014"  ]  },  "execution_count": 28, 7,  "metadata": {},  "output_type": "execute_result"  }  ],  "source": [  "all_train_data[alleles[0]].weights" "all_train_data[alleles[0]].weights.std()"  ]  },  { 

},  {  "cell_type": "code",  "execution_count": 26, 8,  "metadata": {  "collapsed": false  }, 

{  "data": {  "text/plain": [  "{9}" "{8, 9, 10, 11, 12, 13, 14, 15}"  ]  },  "execution_count": 26, 8,  "metadata": {},  "output_type": "execute_result"  }  ],  "source": [  "set(len(x) for x in all_train_data[alleles[0]].peptides)" all_train_data[alleles[0]].original_peptides)"  ]  },  {  "cell_type": "code",  "execution_count": 5, 9,  "metadata": {  "collapsed": false  }, 

},  {  "cell_type": "code",  "execution_count": 63, 10,  "metadata": {  "collapsed": false  }, 

"name": "stdout",  "output_type": "stream",  "text": [  "Training data: 6 1  / 106 alleles\n" ]  }  ], 

},  {  "cell_type": "code",  "execution_count": 70, 11,  "metadata": {  "collapsed": false  }, 

},  {  "cell_type": "code",  "execution_count": 66, 12,  "metadata": {  "collapsed": false  }, 

"name": "stdout",  "output_type": "stream",  "text": [  "60 "48  models\n" ]  },  { 

"{'activation', 'dropout_probability', 'embedding_output_dim', 'layer_sizes'}"  ]  },  "execution_count": 66, 12,  "metadata": {},  "output_type": "execute_result"  } 

"source": [  "dropout_probabilities = [0.0, 0.1, 0.5]\n",  "\n",  "embedding_output_dims = [4, 16, [16,  32, 64, 128]\n", "#embedding_output_dims = [4, 32]\n",  "\n",  "#layer_sizes = [[4], [8], [16], [64], [128]]\n", 

"name": "stdout",  "output_type": "stream",  "text": [  " HLA-A0201 "Allele: HLA-A0201\n",  "--  fold 0 [ 0 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [16]}\n",  "test tau: 0.339114\n",  "test auc: 0.748178\n",  "test f1: 0.620713\n",  "train tau: 0.548417\n",  "train auc: 0.884989\n",  "train f1: 0.748365\n", #1/3\n",  "Training peptides: ['AAAATCALV' 'AAAKAAAAV' 'AAAWYLWEV' 'AADFPGIAR' 'AADKAAAAY' 'AADLTQIFEV'\n",  " HLA-A0201 fold 0 [ 1 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [64]}\n",  "test tau: 0.474557\n",  "test auc: 0.848442\n",  "test f1: 0.545770\n",  "train tau: 0.547662\n",  "train auc: 0.885808\n",  "train f1: 0.738517\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV'\n",  " HLA-A0201 fold 0 [ 2 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [100]}\n",  "test tau: 0.312977\n",  "test auc: 0.730126\n",  "test f1: 0.604546\n",  "train tau: 0.544632\n",  "train auc: 0.883024\n",  "train f1: 0.745066\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AAEQRRSTI'\n",  " 'AAFEDLRLL' 'AAGAAVKGV' 'AAGLPAIFV' 'AAGLQDCTMLV']...\n",  "Test peptides: ['AAAKTPVIV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",  " 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",  " 'AAAKTPVIVV' 'AAASSLLYK' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV'\n",  " 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV']...\n",  "-- # training samples = 21917\n",  "-- # test samples = 10959\n",  "-- X_index shape for CV fold: (21917, 9)\n",  "Training Y min=0.000000, mean=0.335157, max=1.000000, sum=7345.631061, n=21917\n",  "Training weights min=0.100000, mean=0.290961, max=1.000000, sum=6377.000000\n",  " HLA-A0201 fold 0 [ 3 0  / 60] 48]  train_size=21917 test_size=10959 impute=False model={'activation': 'tanh','embedding_output_dim': 4,  'dropout_probability': 0.0, 'layer_sizes': [128]}\n" [16], 'embedding_output_dim': 16}\n"  ]  }  ],  "source": [  "import sys\n",  "\n",  "cv_df = collections.defaultdict(list)\n",  "start = time.time()\n",  "#for (allele, data) in list(train_data.items())[:1]:\n",  "for (allele, data) in train_data.items():\n",  " print(\"Allele: %s\" % allele)\n",  " data_df = allele_data_to_df(data)\n",  " cv = sklearn.cross_validation.StratifiedKFold(log_to_ic50(data.Y) < 500, sklearn.cross_validation.LabelKFold(data.original_peptides,  n_folds = 3)\n", " for (fold_num, (train_indices, test_indices)) in enumerate(cv):\n",  " print(\"-- fold #%d/3\" % (fold_num + 1,))\n",  "  for impute in [False, True]:\n", [False]:\n",  " train_df = data_df.iloc[train_indices]\n",  " test_df = data_df.iloc[test_indices]\n",  " print(\"Training peptides: %s...\" % (data.original_peptides[train_indices][:20],))\n",  " \n",  " print(\"Test peptides: %s...\" % (data.original_peptides[test_indices][:20],))\n",  " print(\"-- # training samples = %d\" % (len(train_df),))\n",  " print(\"-- # test samples = %d\" % (len(test_df),))\n",  " sys.stdout.flush()\n",  " if impute:\n",  " full_train_allele_data = dict(train_data)\n",  " full_train_allele_data[allele] = df_to_allele_data(train_df)\n", 

" X_pretrain = train_imputed_dict[allele].X_index\n",  " Y_pretrain = train_imputed_dict[allele].Y\n",  " sample_weights_pretrain = train_imputed_dict[allele].weights\n",  " \n",  " # gotta shuffle the samples or else everything in a mini-batch is correlated\n",  " pretrain_shuffle_indices = np.arange(len(X_pretrain))\n",  " np.random.shuffle(pretrain_shuffle_indices)\n",  " X_pretrain = X_pretrain[pretrain_shuffle_indices]\n",  " Y_pretrain = Y_pretrain[pretrain_shuffle_indices]\n",  " sample_weights_pretrain = sample_weights_pretrain[pretrain_shuffle_indices]\n",  " \n",  " \n",  " else:\n",  " X_pretrain = Y_pretrain = sample_weights_pretrain = None\n",  "\n", " \n",  " X_cv_fold = make_2d_array(train_df.X_index)\n",  " print(\"-- X_index shape for CV fold: %s\" % (X_cv_fold.shape,))\n",  " sys.stdout.flush()\n",  " Y_cv_fold = train_df.Y\n",  " assert len(X_cv_fold) == len(Y_cv_fold)\n",  " weights_cv_fold = train_df.weights\n",  " assert len(X_cv_fold) == len(weights_cv_fold)\n",  " \n",  " # shuffle all the samples!\n",  " shuffle_indices = np.arange(len(X_cv_fold))\n",  " np.random.shuffle(shuffle_indices)\n",  " X_cv_fold = X_cv_fold[shuffle_indices]\n",  " Y_cv_fold = Y_cv_fold[shuffle_indices]\n",  " weights_cv_fold = weights_cv_fold[shuffle_indices]\n",  " \n",  " X_cv_fold_test = make_2d_array(test_df.X_index)\n",  " Y_cv_fold_test = test_df.Y\n",  " \n",  " print(\"Training Y min=%f, mean=%f, max=%f, sum=%f, n=%d\" % (\n",  " Y_cv_fold.min(),\n",  " Y_cv_fold.mean(),\n",  " Y_cv_fold.max(),\n",  " Y_cv_fold.sum(),\n",  " len(Y_cv_fold)))\n",  " print(\"Training weights min=%f, mean=%f, max=%f, sum=%f\" % (\n",  " weights_cv_fold.min(),\n",  " weights_cv_fold.mean(),\n",  " weights_cv_fold.max(),\n",  " weights_cv_fold.sum()\n",  " ))\n",  " \n",  " for (i, model_params) in enumerate(models_params_list):\n",  " print(\"%10s fold %3d [%3d / %3d] train_size=%d test_size=%d impute=%s model=%s\" %\n",  " (allele, fold_num, i, len(models_params_list), len(train_indices), len(test_indices), impute, model_params))\n",  " sys.stdout.flush()\n",  " model = mhcflurry.Class1BindingPredictor.from_hyperparameters(\n",  " max_ic50=max_ic50,\n",  " **model_params)\n",  "\n",  " fit_time = -time.time()\n",  " model.fit(\n",  " make_2d_array(train_df.X_index),\n", X_cv_fold,\n",  " train_df.Y,\n", Y_cv_fold,\n",  " sample_weights=train_df.weights,\n", sample_weights=weights_cv_fold,\n",  " X_pretrain=X_pretrain,\n",  " Y_pretrain=Y_pretrain,\n",  " sample_weights_pretrain=sample_weights_pretrain,\n",  " verbose=False\n", verbose=False,\n",  " )\n", n_training_epochs=100)\n",  " fit_time += time.time()\n",  " predictions test_predictions  = model.predict(make_2d_array(test_df.X_index))\n", model.predict(X_cv_fold_test)\n",  " train_predictions = model.predict(make_2d_array(train_df.X_index))\n", model.predict(X_cv_fold)\n",  " cv_df[\"allele\"].append(allele)\n",  " cv_df[\"allele_size\"].append(len(data.Y))\n", cv_df[\"allele_size\"].append(Y_cv_fold)\n",  " cv_df[\"train_size\"].append(len(train_indices))\n", cv_df[\"train_size\"].append(Y_cv_fold)\n",  " cv_df[\"model_params\"].append(model_params)\n",  " #cv_df[\"model\"].append(model)\n",  " cv_df[\"impute\"].append(impute)\n",  " cv_df[\"imputed_size\"].append(len(Y_pretrain) if Y_pretrain is not None else None)\n",  " cv_df[\"fit_time\"].append(fit_time)\n",  "\n",  " for (param, param_value) in model_params.iteritems():\n", model_params.items():\n",  " cv_df[param].append(param_value)\n",  " for (key, value) in make_scores(test_df.Y, 
predictions, make_scores(Y_cv_fold_test, test_predictions,  test_df.weights).items():\n", " cv_df[\"test_%s\" % key].append(value)\n",  " print(\"test %s: %f\" % (key, value))\n",  " for (key, value) in make_scores(train_df.Y, make_scores(Y_cv_fold,  train_predictions, train_df.weights).items():\n", " cv_df[\"train_%s\" % key].append(value)\n",  " print(\"train %s: %f\" % (key, value))\n",  "\n", 

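The substantive fix in the hunk above is the splitter. StratifiedKFold over binder status (log_to_ic50(data.Y) < 500) lets the multiple 9-mer expansions of a single peptide land on both sides of a split, which leaks near-duplicate rows into the test fold; LabelKFold over original_peptides keeps every expansion of a peptide inside one fold. sklearn.cross_validation.LabelKFold is the long-removed old API; below is a minimal self-contained sketch of the same idea using its modern replacement, GroupKFold, on toy data:

    import numpy as np
    from sklearn.model_selection import GroupKFold

    # Toy per-row data: repeated group labels mimic one peptide expanded into
    # several encoded rows, as in the notebook's original_peptides array.
    original_peptides = np.array(
        ["AAAKTPVIVV"] * 10 + ["AAAKAAAAV"] + ["AADLTQIFEV"] * 10 + ["AAASSLLYK"])
    X = np.arange(len(original_peptides)).reshape(-1, 1)

    cv = GroupKFold(n_splits=2)
    for fold_num, (train_idx, test_idx) in enumerate(
            cv.split(X, groups=original_peptides)):
        train_set = set(original_peptides[train_idx])
        test_set = set(original_peptides[test_idx])
        assert not (train_set & test_set)  # no peptide straddles the split
        print("fold #%d/2 test peptides: %s" % (fold_num + 1, sorted(test_set)))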
},  {  "cell_type": "code",  "execution_count": 12, null,  "metadata": {  "collapsed": false  },  "outputs": [  {  "name": "stdout",  "output_type": "stream",  "text": [  "allele allele_size impute embedding_output_dim activation layer0_size\n",  "HLA-A0201 32876 False 5 tanh 4 0.863 +/ 0.042\n",  " 8 0.855 +/ 0.032\n",  " 16 0.834 +/ 0.035\n",  " 64 0.849 +/ 0.043\n",  " 128 0.835 +/ 0.038\n",  " 10 tanh 4 0.847 +/ 0.030\n",  " 8 0.866 +/ 0.025\n",  " 16 0.860 +/ 0.030\n",  " 64 0.808 +/ 0.040\n",  " 128 0.817 +/ 0.037\n",  " 32 tanh 4 0.857 +/ 0.025\n",  " 8 0.853 +/ 0.021\n",  " 16 0.825 +/ 0.015\n",  " 64 0.800 +/ 0.019\n",  " 128 0.776 +/ 0.040\n",  " 64 tanh 4 0.853 +/ 0.024\n",  " 8 0.839 +/ 0.025\n",  " 16 0.825 +/ 0.023\n",  " 64 0.795 +/ 0.024\n",  " 128 0.795 +/ 0.025\n",  " 128 tanh 4 0.843 +/ 0.017\n",  " 8 0.839 +/ 0.013\n",  " 16 0.817 +/ 0.021\n",  " 64 0.802 +/ 0.019\n",  " 128 0.791 +/ 0.020\n",  " True 5 tanh 4 0.860 +/ 0.052\n",  " 8 0.850 +/ 0.039\n",  " 16 0.837 +/ 0.022\n",  " 64 0.831 +/ 0.024\n",  " 128 0.780 +/ 0.075\n",  " ... \n",  "HLA-B7301 115 False 128 tanh 4 0.765 +/ 0.020\n",  " 8 0.744 +/ 0.065\n",  " 16 0.728 +/ 0.084\n",  " 64 0.767 +/ 0.027\n",  " 128 0.714 +/ 0.073\n",  " True 5 tanh 4 0.746 +/ 0.105\n",  " 8 0.712 +/ 0.088\n",  " 16 0.774 +/ 0.113\n",  " 64 0.708 +/ 0.175\n",  " 128 0.727 +/ 0.129\n",  " 10 tanh 4 0.710 +/ 0.043\n",  " 8 0.704 +/ 0.106\n",  " 16 0.739 +/ 0.053\n",  " 64 0.802 +/ 0.028\n",  " 128 0.751 +/ 0.055\n",  " 32 tanh 4 0.735 +/ 0.072\n",  " 8 0.854 +/ 0.019\n",  " 16 0.774 +/ 0.045\n",  " 64 0.803 +/ 0.038\n",  " 128 0.727 +/ 0.164\n",  " 64 tanh 4 0.806 +/ 0.021\n",  " 8 0.824 +/ 0.036\n",  " 16 0.750 +/ 0.070\n",  " 64 0.796 +/ 0.147\n",  " 128 0.816 +/ 0.055\n",  " 128 tanh 4 0.783 +/ 0.078\n",  " 8 0.834 +/ 0.041\n",  " 16 0.807 +/ 0.072\n",  " 64 0.796 +/ 0.047\n",  " 128 0.785 +/ 0.014\n",  "dtype: object\n"  ]  }  ], [],  "source": [  "group_columns = [\"allele\", \"allele_size\", \"impute\"]\n",  "group_columns.extend(models_params_explored)\n", 

],  "metadata": {  "kernelspec": {  "display_name": "Python 2", 3",  "language": "python",  "name": "python2" "python3"  },  "language_info": {  "codemirror_mode": {  "name": "ipython",  "version": 2 3  },  "file_extension": ".py",  "mimetype": "text/x-python",  "name": "python",  "nbconvert_exporter": "python",  "pygments_lexer": "ipython2", "ipython3",  "version": "2.7.11" "3.4.3"  }  },  "nbformat": 4,