Alex Rubinsteyn, "sanity checking the CV notebook" (about 8 years ago)

Commit id: d95f64a2dbb57d4b99de284bfb0024a6e2cc9bc7

deletions | additions

"cells": [  {  "cell_type": "code",  "execution_count": 1, 2,  "metadata": {  "collapsed": false  }, 

"output_type": "stream",  "text": [  "Using Theano backend.\n",  "Using gpu device 0: GeForce GTX TITAN X (CNMeM "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle  is enabled deprecated and replaced  with initial size: 75.0% axes.prop_cycle; please use the latter.\n",  " warnings.warn(self.msg_depr % (key, alt_key))\n"  ]  },  {  "name": "stdout",  "output_type": "stream",  "text": [  "Couldn't import dot_parser, loading  of memory, cuDNN 5004)\n" dot files will not be possible.\n"  ]  }  ], 

},  {  "cell_type": "code",  "execution_count": 2, 3,  "metadata": {  "collapsed": true  }, 

"source": [  "min_peptides_to_consider_allele = 50\n",  "max_ic50 = 50000\n",  "data_dir=\"/home/tim/sinai/data/mhc-binding/\"" "data_dir = \"/Users/iskander/code/mhcflurry/\""  ]  },  {  "cell_type": "code",  "execution_count": 24, 4,  "metadata": {  "collapsed": false  }, 

" [137654 rows x 8 columns], 'sequence')"  ]  },  "execution_count": 24, 4,  "metadata": {},  "output_type": "execute_result"  } 

},  {  "cell_type": "code",  "execution_count": 25, 5,  "metadata": {  "collapsed": true false  },  "outputs": [],  "source": [ 

},  {  "cell_type": "code",  "execution_count": 62, 6,  "metadata": {  "collapsed": false  }, 

"source": [  "alleles = [\n",  " \"HLA-A0201\",\n",  " #  \"HLA-A0301\",\n", " #  \"HLA-A0203\",\n", " #  \"HLA-A2602\",\n", " #  \"HLA-A2603\",\n", " #  'HLA-B7301',\n", "]\n",  "#alleles = alleles[:1] + alleles[-1:]\n",  "#alleles = [allele for allele in all_train_data if len(all_train_data[allele].Y) >= min_peptides_to_consider_allele]" 

},  {  "cell_type": "code",  "execution_count": 28, 7,  "metadata": {  "collapsed": false  }, 

{  "data": {  "text/plain": [  "0.29094172040394206" "0.36749263596306014"  ]  },  "execution_count": 28, 7,  "metadata": {},  "output_type": "execute_result"  }  ],  "source": [  "all_train_data[alleles[0]].weights" "all_train_data[alleles[0]].weights.std()"  ]  },  { 

},  {  "cell_type": "code",  "execution_count": 26, 8,  "metadata": {  "collapsed": false  }, 

{  "data": {  "text/plain": [  "{9}" "{8, 9, 10, 11, 12, 13, 14, 15}"  ]  },  "execution_count": 26, 8,  "metadata": {},  "output_type": "execute_result"  }  ],  "source": [  "set(len(x) for x in all_train_data[alleles[0]].peptides)" all_train_data[alleles[0]].original_peptides)"  ]  },  {  "cell_type": "code",  "execution_count": 5, 9,  "metadata": {  "collapsed": false  }, 

},  {  "cell_type": "code",  "execution_count": 63, 10,  "metadata": {  "collapsed": false  }, 

"name": "stdout",  "output_type": "stream",  "text": [  "Training data: 6 1  / 106 alleles\n" ]  }  ], 

},  {  "cell_type": "code",  "execution_count": 70, 11,  "metadata": {  "collapsed": false  }, 

},  {  "cell_type": "code",  "execution_count": 66, 12,  "metadata": {  "collapsed": false  }, 

"name": "stdout",  "output_type": "stream",  "text": [  "60 "48  models\n" ]  },  { 

"{'activation', 'dropout_probability', 'embedding_output_dim', 'layer_sizes'}"  ]  },  "execution_count": 66, 12,  "metadata": {},  "output_type": "execute_result"  } 

"source": [  "dropout_probabilities = [0.0, 0.1, 0.5]\n",  "\n",  "embedding_output_dims = [4, 16, [16,  32, 64, 128]\n", "#embedding_output_dims = [4, 32]\n",  "\n",  "#layer_sizes = [[4], [8], [16], [64], [128]]\n", 

"name": "stdout",  "output_type": "stream",  "text": [  " HLA-A0201 "Allele: HLA-A0201\n",  "--  fold 0 [ 0 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [16]}\n",  "test tau: 0.339114\n",  "test auc: 0.748178\n",  "test f1: 0.620713\n",  "train tau: 0.548417\n",  "train auc: 0.884989\n",  "train f1: 0.748365\n", #1/3\n",  "Training peptides: ['AAAATCALV' 'AAAKAAAAV' 'AAAWYLWEV' 'AADFPGIAR' 'AADKAAAAY' 'AADLTQIFEV'\n",  " HLA-A0201 fold 0 [ 1 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [64]}\n",  "test tau: 0.474557\n",  "test auc: 0.848442\n",  "test f1: 0.545770\n",  "train tau: 0.547662\n",  "train auc: 0.885808\n",  "train f1: 0.738517\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV'\n",  " HLA-A0201 fold 0 [ 2 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [100]}\n",  "test tau: 0.312977\n",  "test auc: 0.730126\n",  "test f1: 0.604546\n",  "train tau: 0.544632\n",  "train auc: 0.883024\n",  "train f1: 0.745066\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AAEQRRSTI'\n",  " 'AAFEDLRLL' 'AAGAAVKGV' 'AAGLPAIFV' 'AAGLQDCTMLV']...\n",  "Test peptides: ['AAAKTPVIV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",  " 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",  " 'AAAKTPVIVV' 'AAASSLLYK' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV'\n",  " 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV']...\n",  "-- # training samples = 21917\n",  "-- # test samples = 10959\n",  "-- X_index shape for CV fold: (21917, 9)\n",  "Training Y min=0.000000, mean=0.335157, max=1.000000, sum=7345.631061, n=21917\n",  "Training weights min=0.100000, mean=0.290961, max=1.000000, sum=6377.000000\n",  " HLA-A0201 fold 0 [ 3 0  / 60] 48]  train_size=21917 test_size=10959 impute=False model={'activation': 'tanh','embedding_output_dim': 4,  'dropout_probability': 0.0, 'layer_sizes': [128]}\n" [16], 'embedding_output_dim': 16}\n"  ]  }  ],  "source": [  "import sys\n",  "\n",  "cv_df = collections.defaultdict(list)\n",  "start = time.time()\n",  "#for (allele, data) in list(train_data.items())[:1]:\n",  "for (allele, data) in train_data.items():\n",  " print(\"Allele: %s\" % allele)\n",  " data_df = allele_data_to_df(data)\n",  " cv = sklearn.cross_validation.StratifiedKFold(log_to_ic50(data.Y) < 500, sklearn.cross_validation.LabelKFold(data.original_peptides,  n_folds = 3)\n", " for (fold_num, (train_indices, test_indices)) in enumerate(cv):\n",  " print(\"-- fold #%d/3\" % (fold_num + 1,))\n",  "  for impute in [False, True]:\n", [False]:\n",  " train_df = data_df.iloc[train_indices]\n",  " test_df = data_df.iloc[test_indices]\n",  " print(\"Training peptides: %s...\" % (data.original_peptides[train_indices][:20],))\n",  " \n",  " print(\"Test peptides: %s...\" % (data.original_peptides[test_indices][:20],))\n",  " print(\"-- # training samples = %d\" % (len(train_df),))\n",  " print(\"-- # test samples = %d\" % (len(test_df),))\n",  " sys.stdout.flush()\n",  " if impute:\n",  " full_train_allele_data = dict(train_data)\n",  " full_train_allele_data[allele] = df_to_allele_data(train_df)\n", 

" X_pretrain = train_imputed_dict[allele].X_index\n",  " Y_pretrain = train_imputed_dict[allele].Y\n",  " sample_weights_pretrain = train_imputed_dict[allele].weights\n",  " \n",  " # gotta shuffle the samples or else everything in a mini-batch is correlated\n",  " pretrain_shuffle_indices = np.arange(len(X_pretrain))\n",  " np.random.shuffle(pretrain_shuffle_indices)\n",  " X_pretrain = X_pretrain[pretrain_shuffle_indices]\n",  " Y_pretrain = Y_pretrain[pretrain_shuffle_indices]\n",  " sample_weights_pretrain = sample_weights_pretrain[pretrain_shuffle_indices]\n",  " \n",  " \n",  " else:\n",  " X_pretrain = Y_pretrain = sample_weights_pretrain = None\n",  "\n", " \n",  " X_cv_fold = make_2d_array(train_df.X_index)\n",  " print(\"-- X_index shape for CV fold: %s\" % (X_cv_fold.shape,))\n",  " sys.stdout.flush()\n",  " Y_cv_fold = train_df.Y\n",  " assert len(X_cv_fold) == len(Y_cv_fold)\n",  " weights_cv_fold = train_df.weights\n",  " assert len(X_cv_fold) == len(weights_cv_fold)\n",  " \n",  " # shuffle all the samples!\n",  " shuffle_indices = np.arange(len(X_cv_fold))\n",  " np.random.shuffle(shuffle_indices)\n",  " X_cv_fold = X_cv_fold[shuffle_indices]\n",  " Y_cv_fold = Y_cv_fold[shuffle_indices]\n",  " weights_cv_fold = weights_cv_fold[shuffle_indices]\n",  " \n",  " X_cv_fold_test = make_2d_array(test_df.X_index)\n",  " Y_cv_fold_test = test_df.Y\n",  " \n",  " print(\"Training Y min=%f, mean=%f, max=%f, sum=%f, n=%d\" % (\n",  " Y_cv_fold.min(),\n",  " Y_cv_fold.mean(),\n",  " Y_cv_fold.max(),\n",  " Y_cv_fold.sum(),\n",  " len(Y_cv_fold)))\n",  " print(\"Training weights min=%f, mean=%f, max=%f, sum=%f\" % (\n",  " weights_cv_fold.min(),\n",  " weights_cv_fold.mean(),\n",  " weights_cv_fold.max(),\n",  " weights_cv_fold.sum()\n",  " ))\n",  " \n",  " for (i, model_params) in enumerate(models_params_list):\n",  " print(\"%10s fold %3d [%3d / %3d] train_size=%d test_size=%d impute=%s model=%s\" %\n",  " (allele, fold_num, i, len(models_params_list), len(train_indices), len(test_indices), impute, model_params))\n",  " sys.stdout.flush()\n",  " model = mhcflurry.Class1BindingPredictor.from_hyperparameters(\n",  " max_ic50=max_ic50,\n",  " **model_params)\n",  "\n",  " fit_time = -time.time()\n",  " model.fit(\n",  " make_2d_array(train_df.X_index),\n", X_cv_fold,\n",  " train_df.Y,\n", Y_cv_fold,\n",  " sample_weights=train_df.weights,\n", sample_weights=weights_cv_fold,\n",  " X_pretrain=X_pretrain,\n",  " Y_pretrain=Y_pretrain,\n",  " sample_weights_pretrain=sample_weights_pretrain,\n",  " verbose=False\n", verbose=False,\n",  " )\n", n_training_epochs=100)\n",  " fit_time += time.time()\n",  " predictions test_predictions  = model.predict(make_2d_array(test_df.X_index))\n", model.predict(X_cv_fold_test)\n",  " train_predictions = model.predict(make_2d_array(train_df.X_index))\n", model.predict(X_cv_fold)\n",  " cv_df[\"allele\"].append(allele)\n",  " cv_df[\"allele_size\"].append(len(data.Y))\n", cv_df[\"allele_size\"].append(Y_cv_fold)\n",  " cv_df[\"train_size\"].append(len(train_indices))\n", cv_df[\"train_size\"].append(Y_cv_fold)\n",  " cv_df[\"model_params\"].append(model_params)\n",  " #cv_df[\"model\"].append(model)\n",  " cv_df[\"impute\"].append(impute)\n",  " cv_df[\"imputed_size\"].append(len(Y_pretrain) if Y_pretrain is not None else None)\n",  " cv_df[\"fit_time\"].append(fit_time)\n",  "\n",  " for (param, param_value) in model_params.iteritems():\n", model_params.items():\n",  " cv_df[param].append(param_value)\n",  " for (key, value) in make_scores(test_df.Y, 
predictions, make_scores(Y_cv_fold_test, test_predictions,  test_df.weights).items():\n", " cv_df[\"test_%s\" % key].append(value)\n",  " print(\"test %s: %f\" % (key, value))\n",  " for (key, value) in make_scores(train_df.Y, make_scores(Y_cv_fold,  train_predictions, train_df.weights).items():\n", " cv_df[\"train_%s\" % key].append(value)\n",  " print(\"train %s: %f\" % (key, value))\n",  "\n", 

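The substantive fix in the hunk above is the splitter. StratifiedKFold over binder status (log_to_ic50(data.Y) < 500) lets the multiple 9-mer expansions of a single peptide land on both sides of a split, which leaks near-duplicate rows into the test fold; LabelKFold over original_peptides keeps every expansion of a peptide inside one fold. sklearn.cross_validation.LabelKFold is the long-removed old API; below is a minimal self-contained sketch of the same idea using its modern replacement, GroupKFold, on toy data:

    import numpy as np
    from sklearn.model_selection import GroupKFold

    # Toy per-row data: repeated group labels mimic one peptide expanded into
    # several encoded rows, as in the notebook's original_peptides array.
    original_peptides = np.array(
        ["AAAKTPVIVV"] * 10 + ["AAAKAAAAV"] + ["AADLTQIFEV"] * 10 + ["AAASSLLYK"])
    X = np.arange(len(original_peptides)).reshape(-1, 1)

    cv = GroupKFold(n_splits=2)
    for fold_num, (train_idx, test_idx) in enumerate(
            cv.split(X, groups=original_peptides)):
        train_set = set(original_peptides[train_idx])
        test_set = set(original_peptides[test_idx])
        assert not (train_set & test_set)  # no peptide straddles the split
        print("fold #%d/2 test peptides: %s" % (fold_num + 1, sorted(test_set)))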
},  {  "cell_type": "code",  "execution_count": 12, null,  "metadata": {  "collapsed": false  },  "outputs": [  {  "name": "stdout",  "output_type": "stream",  "text": [  "allele allele_size impute embedding_output_dim activation layer0_size\n",  "HLA-A0201 32876 False 5 tanh 4 0.863 +/ 0.042\n",  " 8 0.855 +/ 0.032\n",  " 16 0.834 +/ 0.035\n",  " 64 0.849 +/ 0.043\n",  " 128 0.835 +/ 0.038\n",  " 10 tanh 4 0.847 +/ 0.030\n",  " 8 0.866 +/ 0.025\n",  " 16 0.860 +/ 0.030\n",  " 64 0.808 +/ 0.040\n",  " 128 0.817 +/ 0.037\n",  " 32 tanh 4 0.857 +/ 0.025\n",  " 8 0.853 +/ 0.021\n",  " 16 0.825 +/ 0.015\n",  " 64 0.800 +/ 0.019\n",  " 128 0.776 +/ 0.040\n",  " 64 tanh 4 0.853 +/ 0.024\n",  " 8 0.839 +/ 0.025\n",  " 16 0.825 +/ 0.023\n",  " 64 0.795 +/ 0.024\n",  " 128 0.795 +/ 0.025\n",  " 128 tanh 4 0.843 +/ 0.017\n",  " 8 0.839 +/ 0.013\n",  " 16 0.817 +/ 0.021\n",  " 64 0.802 +/ 0.019\n",  " 128 0.791 +/ 0.020\n",  " True 5 tanh 4 0.860 +/ 0.052\n",  " 8 0.850 +/ 0.039\n",  " 16 0.837 +/ 0.022\n",  " 64 0.831 +/ 0.024\n",  " 128 0.780 +/ 0.075\n",  " ... \n",  "HLA-B7301 115 False 128 tanh 4 0.765 +/ 0.020\n",  " 8 0.744 +/ 0.065\n",  " 16 0.728 +/ 0.084\n",  " 64 0.767 +/ 0.027\n",  " 128 0.714 +/ 0.073\n",  " True 5 tanh 4 0.746 +/ 0.105\n",  " 8 0.712 +/ 0.088\n",  " 16 0.774 +/ 0.113\n",  " 64 0.708 +/ 0.175\n",  " 128 0.727 +/ 0.129\n",  " 10 tanh 4 0.710 +/ 0.043\n",  " 8 0.704 +/ 0.106\n",  " 16 0.739 +/ 0.053\n",  " 64 0.802 +/ 0.028\n",  " 128 0.751 +/ 0.055\n",  " 32 tanh 4 0.735 +/ 0.072\n",  " 8 0.854 +/ 0.019\n",  " 16 0.774 +/ 0.045\n",  " 64 0.803 +/ 0.038\n",  " 128 0.727 +/ 0.164\n",  " 64 tanh 4 0.806 +/ 0.021\n",  " 8 0.824 +/ 0.036\n",  " 16 0.750 +/ 0.070\n",  " 64 0.796 +/ 0.147\n",  " 128 0.816 +/ 0.055\n",  " 128 tanh 4 0.783 +/ 0.078\n",  " 8 0.834 +/ 0.041\n",  " 16 0.807 +/ 0.072\n",  " 64 0.796 +/ 0.047\n",  " 128 0.785 +/ 0.014\n",  "dtype: object\n"  ]  }  ], [],  "source": [  "group_columns = [\"allele\", \"allele_size\", \"impute\"]\n",  "group_columns.extend(models_params_explored)\n", 

],  "metadata": {  "kernelspec": {  "display_name": "Python 2", 3",  "language": "python",  "name": "python2" "python3"  },  "language_info": {  "codemirror_mode": {  "name": "ipython",  "version": 2 3  },  "file_extension": ".py",  "mimetype": "text/x-python",  "name": "python",  "nbconvert_exporter": "python",  "pygments_lexer": "ipython2", "ipython3",  "version": "2.7.11" "3.4.3"  }  },  "nbformat": 4,