{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load required modules\n", "\n", "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.metrics import accuracy_score, confusion_matrix\n", "from sklearn.inspection import permutation_importance\n", "\n", "import shap\n", "\n", "import os\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set working dir\n", "os.chdir('/users/rg/dgarrido/PhD/projects/side_tasks/chrom_bea/')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gene subset of interest\n", "sub_f = \"data/subset/upregulated.bed\"\n", "sub = pd.read_csv(sub_f, sep = \"\\t\", header = None)[3]\n", "sub = sub.str.replace(\"\\..*\", \"\")\n", "sub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Expression data\n", "expr_f = \"data/gene.expression.tsv\"\n", "expr = pd.read_csv(expr_f, sep = \"\\t\")\n", "timepoints = expr.columns.tolist()\n", "expr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Chromatin data\n", "marks_f = \"data/marks.txt\"\n", "marks_dir = \"data/marks\"\n", "marks = pd.read_csv(marks_f, sep = \"\\t\", header = None, squeeze = True)\n", "\n", "chrom = pd.read_csv(\"%s/%s.tsv\" % (marks_dir, marks[0]), sep = \"\\t\")\n", "chrom.columns = [marks[0] + \"_\" + i for i in chrom.columns]\n", "\n", "for mark in marks[1:]:\n", " tmp = pd.read_csv(\"%s/%s.tsv\" % (marks_dir, mark), sep = \"\\t\")\n", " tmp.columns = [mark + \"_\" + i for i in tmp.columns]\n", " chrom = chrom.join(tmp)\n", "chrom" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Data matrices\n", "X = chrom.loc[sub,]\n", "Y = expr.loc[sub,] \n", "\n", "tpselx = \"H006\"\n", "tpsely = \"H003\"\n", "\n", "X = X.filter(like = tpselx, axis = 1)\n", "X.columns = X.columns.str.replace(\"_.*\", \"\")\n", "y = Y[tpsely]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Chromatin: \" + tpselx + \"; Expression: \" + tpsely)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# training/test split\n", "repeat = 0\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=repeat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hyperparameter tuning (random search)\n", "\n", "# Number of trees in random forest\n", "n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]\n", "# Number of features to consider at every split\n", "max_features = ['auto', 'sqrt']\n", "# Maximum number of levels in tree\n", "max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\n", "max_depth.append(None)\n", "# Minimum number of samples required to split a node\n", "min_samples_split = [2, 5, 10]\n", "# Minimum number of samples required at each leaf node\n", "min_samples_leaf = [1, 2, 4]\n", "# Method of selecting samples for training each tree\n", "bootstrap = [True, False]# Create the random grid\n", "\n", "random_grid = {'n_estimators': n_estimators,\n", " 'max_features': max_features,\n", " 'max_depth': max_depth,\n", " 'min_samples_split': min_samples_split,\n", " 'min_samples_leaf': min_samples_leaf,\n", " 'bootstrap': bootstrap}\n", "\n", "# First create the base model to tune\n", "rfc = RandomForestRegressor()\n", "# Random search of parameters, using 10 fold cross validation, \n", "# search across 100 different combinations, and use all available cores\n", "rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=1, random_state=repeat, n_jobs = -1)# Fit the random search model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rfc_random.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rfc_random.best_params_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Accuracy in the test set\n", "rfc_best = rfc_random.best_estimator_\n", "y_test_pred = rfc_best.predict(X_test)\n", "print(np.corrcoef(y_test_pred, y_test))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gini importance\n", "imp_gini = rfc_best.feature_importances_\n", "idx = imp_gini.argsort()[::-1]\n", "for feat,imp in zip(X.columns[idx],imp_gini[idx]):\n", " print(str(feat)+\"\\t\"+str(round(imp, 4)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Permutation importance\n", "# r = permutation_importance(rfc_best, X_test, y_test, n_repeats=30, random_state=0, n_jobs=-1)\n", "# idx = r.importances_mean.argsort()[::-1]\n", "# for feat,imp in zip(X.columns[idx], r.importances_mean[idx]):\n", "# print(str(feat)+\"\\t\"+str(round(imp, 4)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Explain the model's predictions using SHAP\n", "explainer = shap.TreeExplainer(rfc_best)\n", "shap_values = explainer.shap_values(X_test)\n", "for a in shap_values[0]:\n", " print('\\t'.join([str(i) for i in a]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plots\n", "# shap.summary_plot(shap_values, X_train, plot_type = \"bar\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# shap.summary_plot(shap_values, X_test, plot_type = \"dot\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }