diff --git a/notebooks/cv.ipynb b/notebooks/cv.ipynb
index e80f3e8..41b7fc3 100644
--- a/notebooks/cv.ipynb
+++ b/notebooks/cv.ipynb
...
"cells": [
{
"cell_type": "code",
"execution_count":
1, 2,
"metadata": {
"collapsed": false
},
...
"output_type": "stream",
"text": [
"Using Theano backend.\n",
"Using gpu device 0: GeForce GTX TITAN X (CNMeM "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is
enabled deprecated and replaced with
initial size: 75.0% axes.prop_cycle; please use the latter.\n",
" warnings.warn(self.msg_depr % (key, alt_key))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Couldn't import dot_parser, loading of
memory, cuDNN 5004)\n" dot files will not be possible.\n"
]
}
],
...
},
{
"cell_type": "code",
"execution_count":
2, 3,
"metadata": {
"collapsed": true
},
...
"source": [
"min_peptides_to_consider_allele = 50\n",
"max_ic50 = 50000\n",
"data_dir=\"/home/tim/sinai/data/mhc-binding/\"" "data_dir = \"/Users/iskander/code/mhcflurry/\""
]
},
{
"cell_type": "code",
"execution_count":
24, 4,
"metadata": {
"collapsed": false
},
...
" [137654 rows x 8 columns], 'sequence')"
]
},
"execution_count":
24, 4,
"metadata": {},
"output_type": "execute_result"
}
...
},
{
"cell_type": "code",
"execution_count":
25, 5,
"metadata": {
"collapsed":
true false
},
"outputs": [],
"source": [
...
},
{
"cell_type": "code",
"execution_count":
62, 6,
"metadata": {
"collapsed": false
},
...
"source": [
"alleles = [\n",
" \"HLA-A0201\",\n",
"
# \"HLA-A0301\",\n",
"
# \"HLA-A0203\",\n",
"
# \"HLA-A2602\",\n",
"
# \"HLA-A2603\",\n",
"
# 'HLA-B7301',\n",
"]\n",
"#alleles = alleles[:1] + alleles[-1:]\n",
"#alleles = [allele for allele in all_train_data if len(all_train_data[allele].Y) >= min_peptides_to_consider_allele]"
...
},
{
"cell_type": "code",
"execution_count":
28, 7,
"metadata": {
"collapsed": false
},
...
{
"data": {
"text/plain": [
"0.29094172040394206" "0.36749263596306014"
]
},
"execution_count":
28, 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_train_data[alleles[0]].weights" "all_train_data[alleles[0]].weights.std()"
]
},
{
...
},
{
"cell_type": "code",
"execution_count":
26, 8,
"metadata": {
"collapsed": false
},
...
{
"data": {
"text/plain": [
"{9}" "{8, 9, 10, 11, 12, 13, 14, 15}"
]
},
"execution_count":
26, 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set(len(x) for x in
all_train_data[alleles[0]].peptides)" all_train_data[alleles[0]].original_peptides)"
]
},
{
"cell_type": "code",
"execution_count":
5, 9,
"metadata": {
"collapsed": false
},
...
},
{
"cell_type": "code",
"execution_count":
63, 10,
"metadata": {
"collapsed": false
},
...
"name": "stdout",
"output_type": "stream",
"text": [
"Training data:
6 1 / 106 alleles\n"
]
}
],
...
},
{
"cell_type": "code",
"execution_count":
70, 11,
"metadata": {
"collapsed": false
},
...
},
{
"cell_type": "code",
"execution_count":
66, 12,
"metadata": {
"collapsed": false
},
...
"name": "stdout",
"output_type": "stream",
"text": [
"60 "48 models\n"
]
},
{
...
"{'activation', 'dropout_probability', 'embedding_output_dim', 'layer_sizes'}"
]
},
"execution_count":
66, 12,
"metadata": {},
"output_type": "execute_result"
}
...
"source": [
"dropout_probabilities = [0.0, 0.1, 0.5]\n",
"\n",
"embedding_output_dims =
[4, 16, [16, 32, 64, 128]\n",
"#embedding_output_dims = [4, 32]\n",
"\n",
"#layer_sizes = [[4], [8], [16], [64], [128]]\n",
...
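The cell above pins down the hyperparameter grid that the cross-validation loop sweeps: the "48 models" count and the explored-parameter set {'activation', 'dropout_probability', 'embedding_output_dim', 'layer_sizes'} come from taking every combination of these lists. The construction of `models_params_list` itself is elided, so the following is only a plausible sketch; the `layer_sizes` options and the single `tanh` activation are assumptions taken from the fold printouts, not from the cell itself.

```python
# Hypothetical reconstruction of the grid expansion; the real cell may differ.
import itertools

grid = {
    "activation": ["tanh"],                      # assumed single activation
    "dropout_probability": [0.0, 0.1, 0.5],      # from the cell above
    "embedding_output_dim": [16, 32, 64, 128],   # from the cell above
    "layer_sizes": [[16], [64], [100], [128]],   # assumed set of options
}
models_params_explored = set(grid)
models_params_list = [
    dict(zip(grid, combo)) for combo in itertools.product(*grid.values())
]
print("%d models" % len(models_params_list))  # 1 * 3 * 4 * 4 -> "48 models"
```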
"name": "stdout",
"output_type": "stream",
"text": [
" HLA-A0201 "Allele: HLA-A0201\n",
"-- fold
0 [ 0 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [16]}\n",
"test tau: 0.339114\n",
"test auc: 0.748178\n",
"test f1: 0.620713\n",
"train tau: 0.548417\n",
"train auc: 0.884989\n",
"train f1: 0.748365\n", #1/3\n",
"Training peptides: ['AAAATCALV' 'AAAKAAAAV' 'AAAWYLWEV' 'AADFPGIAR' 'AADKAAAAY' 'AADLTQIFEV'\n",
"
HLA-A0201 fold 0 [ 1 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [64]}\n",
"test tau: 0.474557\n",
"test auc: 0.848442\n",
"test f1: 0.545770\n",
"train tau: 0.547662\n",
"train auc: 0.885808\n",
"train f1: 0.738517\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV'\n",
"
HLA-A0201 fold 0 [ 2 / 60] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh', 'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes': [100]}\n",
"test tau: 0.312977\n",
"test auc: 0.730126\n",
"test f1: 0.604546\n",
"train tau: 0.544632\n",
"train auc: 0.883024\n",
"train f1: 0.745066\n", 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AADLTQIFEV' 'AAEQRRSTI'\n",
" 'AAFEDLRLL' 'AAGAAVKGV' 'AAGLPAIFV' 'AAGLQDCTMLV']...\n",
"Test peptides: ['AAAKTPVIV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",
" 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV' 'AAAKTPVIVV'\n",
" 'AAAKTPVIVV' 'AAASSLLYK' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV'\n",
" 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV' 'AAASSTHRKV']...\n",
"-- # training samples = 21917\n",
"-- # test samples = 10959\n",
"-- X_index shape for CV fold: (21917, 9)\n",
"Training Y min=0.000000, mean=0.335157, max=1.000000, sum=7345.631061, n=21917\n",
"Training weights min=0.100000, mean=0.290961, max=1.000000, sum=6377.000000\n",
" HLA-A0201 fold 0 [
3 0 /
60] 48] train_size=21917 test_size=10959 impute=False model={'activation': 'tanh',
'embedding_output_dim': 4, 'dropout_probability': 0.0, 'layer_sizes':
[128]}\n" [16], 'embedding_output_dim': 16}\n"
]
}
],
"source": [
"import sys\n",
"\n",
"cv_df = collections.defaultdict(list)\n",
"start = time.time()\n",
"#for (allele, data) in list(train_data.items())[:1]:\n",
"for (allele, data) in train_data.items():\n",
" print(\"Allele: %s\" % allele)\n",
" data_df = allele_data_to_df(data)\n",
" cv =
sklearn.cross_validation.StratifiedKFold(log_to_ic50(data.Y) < 500, sklearn.cross_validation.LabelKFold(data.original_peptides, n_folds = 3)\n",
" for (fold_num, (train_indices, test_indices)) in enumerate(cv):\n",
"
print(\"-- fold #%d/3\" % (fold_num + 1,))\n",
" for impute in
[False, True]:\n", [False]:\n",
" train_df = data_df.iloc[train_indices]\n",
" test_df = data_df.iloc[test_indices]\n",
" print(\"Training peptides: %s...\" % (data.original_peptides[train_indices][:20],))\n",
" \n",
" print(\"Test peptides: %s...\" % (data.original_peptides[test_indices][:20],))\n",
" print(\"-- # training samples = %d\" % (len(train_df),))\n",
" print(\"-- # test samples = %d\" % (len(test_df),))\n",
" sys.stdout.flush()\n",
" if impute:\n",
" full_train_allele_data = dict(train_data)\n",
" full_train_allele_data[allele] = df_to_allele_data(train_df)\n",
...
" X_pretrain = train_imputed_dict[allele].X_index\n",
" Y_pretrain = train_imputed_dict[allele].Y\n",
" sample_weights_pretrain = train_imputed_dict[allele].weights\n",
" \n",
" # gotta shuffle the samples or else everything in a mini-batch is correlated\n",
" pretrain_shuffle_indices = np.arange(len(X_pretrain))\n",
" np.random.shuffle(pretrain_shuffle_indices)\n",
" X_pretrain = X_pretrain[pretrain_shuffle_indices]\n",
" Y_pretrain = Y_pretrain[pretrain_shuffle_indices]\n",
" sample_weights_pretrain = sample_weights_pretrain[pretrain_shuffle_indices]\n",
" \n",
" \n",
" else:\n",
" X_pretrain = Y_pretrain = sample_weights_pretrain = None\n",
"\n", " \n",
" X_cv_fold = make_2d_array(train_df.X_index)\n",
" print(\"-- X_index shape for CV fold: %s\" % (X_cv_fold.shape,))\n",
" sys.stdout.flush()\n",
" Y_cv_fold = train_df.Y\n",
" assert len(X_cv_fold) == len(Y_cv_fold)\n",
" weights_cv_fold = train_df.weights\n",
" assert len(X_cv_fold) == len(weights_cv_fold)\n",
" \n",
" # shuffle all the samples!\n",
" shuffle_indices = np.arange(len(X_cv_fold))\n",
" np.random.shuffle(shuffle_indices)\n",
" X_cv_fold = X_cv_fold[shuffle_indices]\n",
" Y_cv_fold = Y_cv_fold[shuffle_indices]\n",
" weights_cv_fold = weights_cv_fold[shuffle_indices]\n",
" \n",
" X_cv_fold_test = make_2d_array(test_df.X_index)\n",
" Y_cv_fold_test = test_df.Y\n",
" \n",
" print(\"Training Y min=%f, mean=%f, max=%f, sum=%f, n=%d\" % (\n",
" Y_cv_fold.min(),\n",
" Y_cv_fold.mean(),\n",
" Y_cv_fold.max(),\n",
" Y_cv_fold.sum(),\n",
" len(Y_cv_fold)))\n",
" print(\"Training weights min=%f, mean=%f, max=%f, sum=%f\" % (\n",
" weights_cv_fold.min(),\n",
" weights_cv_fold.mean(),\n",
" weights_cv_fold.max(),\n",
" weights_cv_fold.sum()\n",
" ))\n",
" \n",
" for (i, model_params) in enumerate(models_params_list):\n",
" print(\"%10s fold %3d [%3d / %3d] train_size=%d test_size=%d impute=%s model=%s\" %\n",
" (allele, fold_num, i, len(models_params_list), len(train_indices), len(test_indices), impute, model_params))\n",
" sys.stdout.flush()\n",
" model = mhcflurry.Class1BindingPredictor.from_hyperparameters(\n",
" max_ic50=max_ic50,\n",
" **model_params)\n",
"\n",
" fit_time = -time.time()\n",
" model.fit(\n",
"
make_2d_array(train_df.X_index),\n", X_cv_fold,\n",
"
train_df.Y,\n", Y_cv_fold,\n",
"
sample_weights=train_df.weights,\n", sample_weights=weights_cv_fold,\n",
" X_pretrain=X_pretrain,\n",
" Y_pretrain=Y_pretrain,\n",
" sample_weights_pretrain=sample_weights_pretrain,\n",
"
verbose=False\n", verbose=False,\n",
"
)\n", n_training_epochs=100)\n",
" fit_time += time.time()\n",
"
predictions test_predictions =
model.predict(make_2d_array(test_df.X_index))\n", model.predict(X_cv_fold_test)\n",
" train_predictions =
model.predict(make_2d_array(train_df.X_index))\n", model.predict(X_cv_fold)\n",
" cv_df[\"allele\"].append(allele)\n",
"
cv_df[\"allele_size\"].append(len(data.Y))\n", cv_df[\"allele_size\"].append(Y_cv_fold)\n",
"
cv_df[\"train_size\"].append(len(train_indices))\n", cv_df[\"train_size\"].append(Y_cv_fold)\n",
" cv_df[\"model_params\"].append(model_params)\n",
" #cv_df[\"model\"].append(model)\n",
" cv_df[\"impute\"].append(impute)\n",
" cv_df[\"imputed_size\"].append(len(Y_pretrain) if Y_pretrain is not None else None)\n",
" cv_df[\"fit_time\"].append(fit_time)\n",
"\n",
" for (param, param_value) in
model_params.iteritems():\n", model_params.items():\n",
" cv_df[param].append(param_value)\n",
" for (key, value) in
make_scores(test_df.Y, predictions, make_scores(Y_cv_fold_test, test_predictions, test_df.weights).items():\n",
" cv_df[\"test_%s\" % key].append(value)\n",
" print(\"test %s: %f\" % (key, value))\n",
" for (key, value) in
make_scores(train_df.Y, make_scores(Y_cv_fold, train_predictions, train_df.weights).items():\n",
" cv_df[\"train_%s\" % key].append(value)\n",
" print(\"train %s: %f\" % (key, value))\n",
"\n",
...
},
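The loop above builds its folds with LabelKFold over data.original_peptides rather than stratifying on IC50, so all length-expanded copies of a peptide (note the repeated 'AADLTQIFEV' and 'AAAKTPVIVV' rows in the training/test listings) stay on one side of each split, and each fold is reshuffled before fitting so mini-batches are not dominated by adjacent, correlated rows. Below is a minimal sketch of the grouping idea only, using GroupKFold, the replacement for LabelKFold in current scikit-learn; the peptide values are illustrative, not taken from the data.

```python
# Group-aware CV sketch: copies of a peptide never straddle train/test.
import numpy as np
from sklearn.model_selection import GroupKFold  # modern stand-in for LabelKFold

peptides = np.array(
    ["AADLTQIFEV"] * 3 + ["AAAKTPVIVV"] * 2 + ["AAFEDLRLL"] * 2)
cv = GroupKFold(n_splits=3)
for fold_num, (train_idx, test_idx) in enumerate(
        cv.split(peptides, groups=peptides)):
    # Every copy of a peptide lands entirely in train or entirely in test.
    assert not set(peptides[train_idx]) & set(peptides[test_idx])
    print("fold %d: train=%s test=%s" % (fold_num, train_idx, test_idx))
```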
{
"cell_type": "code",
"execution_count":
12, null,
"metadata": {
"collapsed": false
},
"outputs":
[
{
"name": "stdout",
"output_type": "stream",
"text": [
"allele allele_size impute embedding_output_dim activation layer0_size\n",
"HLA-A0201 32876 False 5 tanh 4 0.863 +/ 0.042\n",
" 8 0.855 +/ 0.032\n",
" 16 0.834 +/ 0.035\n",
" 64 0.849 +/ 0.043\n",
" 128 0.835 +/ 0.038\n",
" 10 tanh 4 0.847 +/ 0.030\n",
" 8 0.866 +/ 0.025\n",
" 16 0.860 +/ 0.030\n",
" 64 0.808 +/ 0.040\n",
" 128 0.817 +/ 0.037\n",
" 32 tanh 4 0.857 +/ 0.025\n",
" 8 0.853 +/ 0.021\n",
" 16 0.825 +/ 0.015\n",
" 64 0.800 +/ 0.019\n",
" 128 0.776 +/ 0.040\n",
" 64 tanh 4 0.853 +/ 0.024\n",
" 8 0.839 +/ 0.025\n",
" 16 0.825 +/ 0.023\n",
" 64 0.795 +/ 0.024\n",
" 128 0.795 +/ 0.025\n",
" 128 tanh 4 0.843 +/ 0.017\n",
" 8 0.839 +/ 0.013\n",
" 16 0.817 +/ 0.021\n",
" 64 0.802 +/ 0.019\n",
" 128 0.791 +/ 0.020\n",
" True 5 tanh 4 0.860 +/ 0.052\n",
" 8 0.850 +/ 0.039\n",
" 16 0.837 +/ 0.022\n",
" 64 0.831 +/ 0.024\n",
" 128 0.780 +/ 0.075\n",
" ... \n",
"HLA-B7301 115 False 128 tanh 4 0.765 +/ 0.020\n",
" 8 0.744 +/ 0.065\n",
" 16 0.728 +/ 0.084\n",
" 64 0.767 +/ 0.027\n",
" 128 0.714 +/ 0.073\n",
" True 5 tanh 4 0.746 +/ 0.105\n",
" 8 0.712 +/ 0.088\n",
" 16 0.774 +/ 0.113\n",
" 64 0.708 +/ 0.175\n",
" 128 0.727 +/ 0.129\n",
" 10 tanh 4 0.710 +/ 0.043\n",
" 8 0.704 +/ 0.106\n",
" 16 0.739 +/ 0.053\n",
" 64 0.802 +/ 0.028\n",
" 128 0.751 +/ 0.055\n",
" 32 tanh 4 0.735 +/ 0.072\n",
" 8 0.854 +/ 0.019\n",
" 16 0.774 +/ 0.045\n",
" 64 0.803 +/ 0.038\n",
" 128 0.727 +/ 0.164\n",
" 64 tanh 4 0.806 +/ 0.021\n",
" 8 0.824 +/ 0.036\n",
" 16 0.750 +/ 0.070\n",
" 64 0.796 +/ 0.147\n",
" 128 0.816 +/ 0.055\n",
" 128 tanh 4 0.783 +/ 0.078\n",
" 8 0.834 +/ 0.041\n",
" 16 0.807 +/ 0.072\n",
" 64 0.796 +/ 0.047\n",
" 128 0.785 +/ 0.014\n",
"dtype: object\n"
]
}
-],
+"outputs": [],
"source": [
"group_columns = [\"allele\", \"allele_size\", \"impute\"]\n",
"group_columns.extend(models_params_explored)\n",
...
],
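The last cell groups the collected cross-validation scores by allele, allele size, imputation flag, and the explored hyperparameters; the output that was cleared above summarised test AUC per group as mean plus/minus standard deviation. A self-contained sketch of that aggregation follows, with a toy frame and a reduced column set standing in for the real cv_df.

```python
# Toy stand-in for cv_df; the real frame has one row per (fold, model) run.
import pandas as pd

results = pd.DataFrame({
    "allele": ["HLA-A0201"] * 4,
    "embedding_output_dim": [16, 16, 32, 32],
    "test_auc": [0.86, 0.84, 0.82, 0.80],
})
summary = results.groupby(["allele", "embedding_output_dim"])["test_auc"].agg(
    lambda x: "%0.3f +/- %0.3f" % (x.mean(), x.std()))
print(summary)
```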
"metadata": {
"kernelspec": {
"display_name": "Python
2", 3",
"language": "python",
"name":
"python2" "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version":
2 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer":
"ipython2", "ipython3",
"version":
"2.7.11" "3.4.3"
}
},
"nbformat": 4,