{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load required modules\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "from sklearn.inspection import permutation_importance\n",
    "\n",
    "import shap\n",
    "\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set working dir\n",
    "os.chdir('/users/rg/dgarrido/PhD/projects/side_tasks/chrom_bea/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene subset of interest\n",
    "sub_f = \"data/subset/upregulated.bed\"\n",
    "sub = pd.read_csv(sub_f, sep = \"\\t\", header = None)[3]\n",
    "sub = sub.str.replace(\"\\..*\", \"\")\n",
    "sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expression data\n",
    "expr_f = \"data/gene.expression.tsv\"\n",
    "expr = pd.read_csv(expr_f, sep = \"\\t\")\n",
    "timepoints = expr.columns.tolist()\n",
    "expr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chromatin data\n",
    "marks_f = \"data/marks.txt\"\n",
    "marks_dir = \"data/marks\"\n",
    "marks = pd.read_csv(marks_f, sep = \"\\t\", header = None, squeeze = True)\n",
    "\n",
    "chrom = pd.read_csv(\"%s/%s.tsv\" % (marks_dir, marks[0]), sep = \"\\t\")\n",
    "chrom.columns = [marks[0] + \"_\" + i for i in chrom.columns]\n",
    "\n",
    "for mark in marks[1:]:\n",
    "    tmp =  pd.read_csv(\"%s/%s.tsv\" % (marks_dir, mark), sep = \"\\t\")\n",
    "    tmp.columns = [mark + \"_\" + i for i in tmp.columns]\n",
    "    chrom = chrom.join(tmp)\n",
    "chrom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data matrices\n",
    "X = chrom.loc[sub,]\n",
    "Y = expr.loc[sub,]  \n",
    "\n",
    "tpselx = \"H006\"\n",
    "tpsely = \"H003\"\n",
    "\n",
    "X = X.filter(like = tpselx, axis = 1)\n",
    "X.columns = X.columns.str.replace(\"_.*\", \"\")\n",
    "y = Y[tpsely]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Chromatin: \" + tpselx + \"; Expression: \" + tpsely)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# training/test split\n",
    "repeat = 0\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=repeat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameter tuning (random search)\n",
    "\n",
    "# Number of trees in random forest\n",
    "n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]\n",
    "# Number of features to consider at every split\n",
    "max_features = ['auto', 'sqrt']\n",
    "# Maximum number of levels in tree\n",
    "max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\n",
    "max_depth.append(None)\n",
    "# Minimum number of samples required to split a node\n",
    "min_samples_split = [2, 5, 10]\n",
    "# Minimum number of samples required at each leaf node\n",
    "min_samples_leaf = [1, 2, 4]\n",
    "# Method of selecting samples for training each tree\n",
    "bootstrap = [True, False]# Create the random grid\n",
    "\n",
    "random_grid = {'n_estimators': n_estimators,\n",
    "               'max_features': max_features,\n",
    "               'max_depth': max_depth,\n",
    "               'min_samples_split': min_samples_split,\n",
    "               'min_samples_leaf': min_samples_leaf,\n",
    "               'bootstrap': bootstrap}\n",
    "\n",
    "# First create the base model to tune\n",
    "rfc = RandomForestRegressor()\n",
    "# Random search of parameters, using 10 fold cross validation, \n",
    "# search across 100 different combinations, and use all available cores\n",
    "rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=1, random_state=repeat, n_jobs = -1)# Fit the random search model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfc_random.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfc_random.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy in the test set\n",
    "rfc_best = rfc_random.best_estimator_\n",
    "y_test_pred = rfc_best.predict(X_test)\n",
    "print(np.corrcoef(y_test_pred, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gini importance\n",
    "imp_gini = rfc_best.feature_importances_\n",
    "idx = imp_gini.argsort()[::-1]\n",
    "for feat,imp in zip(X.columns[idx],imp_gini[idx]):\n",
    "    print(str(feat)+\"\\t\"+str(round(imp, 4)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Permutation importance\n",
    "# r = permutation_importance(rfc_best, X_test, y_test, n_repeats=30, random_state=0, n_jobs=-1)\n",
    "# idx = r.importances_mean.argsort()[::-1]\n",
    "# for feat,imp in zip(X.columns[idx], r.importances_mean[idx]):\n",
    "#    print(str(feat)+\"\\t\"+str(round(imp, 4)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain the model's predictions using SHAP\n",
    "explainer = shap.TreeExplainer(rfc_best)\n",
    "shap_values = explainer.shap_values(X_test)\n",
    "for a in shap_values[0]:\n",
    "    print('\\t'.join([str(i) for i in a]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plots\n",
    "# shap.summary_plot(shap_values, X_train, plot_type = \"bar\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shap.summary_plot(shap_values, X_test, plot_type = \"dot\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}