#!/usr/bin/env python3

# Load required modules
from argparse import ArgumentParser
import sys
import os
import random
import pickle
import shap
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# Parse arguments
def define_options():
    parser = ArgumentParser(description='Train a random forest regressor with hyperparameter tuning and variable importance estimation.')
    parser.add_argument('-c', '--chromatin', type=str, help='Chromatin data folder', default=None)
    parser.add_argument('-m', '--marks', type=str, help='Subset of marks of interest (TXT)', default=None)
    parser.add_argument('-e', '--expression', type=str, help='Expression data', default=None)
    parser.add_argument('-g', '--genes', type=str, help='Subset of genes of interest (BED)', default=None)
    parser.add_argument('-x', '--chrom_time', type=str, help='Chromatin (explanatory) time point', default=None)
    parser.add_argument('-y', '--expr_time', type=str, help='Expression (target) time point', default=None)
    parser.add_argument('-r', '--random', type=int, help='Seed for pseudo-random number generation [default=%(default)s]', default=1)
    parser.add_argument('-v', '--cv', type=int, help='Cross-validation folds [default=%(default)s]', default=10)
    parser.add_argument('-i', '--n_iter', type=int, help='Random hyperparameter search no. iterations [default=%(default)s]', default=100)
    return parser

parser = define_options()
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()

# Output files
output_shap  = 'shap.txt'
output_gini  = 'gini.txt'
output_perm  = 'perm.txt'
output_perf  = 'perf.txt'
output_x     = 'x.txt'
output_model = 'model.pickle'

# Genes of interest (column 4 of the BED file), with version suffixes stripped
sub = pd.read_csv(args.genes, sep="\t", header=None)[3]
sub = sub.str.replace(r"\..*", "", regex=True)

# Expression data
expr = pd.read_csv(args.expression, sep="\t")
timepoints = expr.columns.tolist()

# Chromatin data: one TSV per mark, columns prefixed with the mark name
marks = pd.read_csv(args.marks, sep="\t", header=None).squeeze("columns")
chrom = pd.read_csv("%s/%s.tsv" % (args.chromatin, marks[0]), sep="\t")
chrom.columns = [marks[0] + "_" + i for i in chrom.columns]
for mark in marks[1:]:
    tmp = pd.read_csv("%s/%s.tsv" % (args.chromatin, mark), sep="\t")
    tmp.columns = [mark + "_" + i for i in tmp.columns]
    chrom = chrom.join(tmp)

# Subset and split
# Genes
X = chrom.loc[sub]
Y = expr.loc[sub]

# Timepoints: keep chromatin columns for the explanatory time point
# (column names reduce to the mark name) and centre the target expression
X = X.filter(like=args.chrom_time, axis=1)
X.columns = X.columns.str.replace(r"_.*", "", regex=True)
y = Y[args.expr_time]
y = y - y.mean()

# Training/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=args.random)

# Save X_test
X_test.to_csv(output_x, sep='\t')

# Hyperparameter tuning (random search)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
# (1.0 replaces 'auto', which recent scikit-learn versions no longer accept)
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# First create the base model to tune
rfc = RandomForestRegressor()
# Random search over the grid using args.cv-fold cross-validation and
# args.n_iter sampled parameter combinations, on all available cores
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid,
                                n_iter=args.n_iter, cv=args.cv, verbose=1,
                                random_state=args.random, n_jobs=-1)
# Fit the random search model
rfc_random.fit(X_train, y_train)

# Best model
rfc_best = rfc_random.best_estimator_
with open(output_model, 'wb') as f:
    pickle.dump(rfc_best, f)

# Performance in the test set (RMSE, Pearson r, R^2)
y_test_pred = rfc_best.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r = np.corrcoef(y_test, y_test_pred)[0, 1]
r2 = r2_score(y_test, y_test_pred)
with open(output_perf, "w") as f:
    f.write(str(rmse) + "\t" + str(r) + "\t" + str(r2) + '\n')

# Importance (Gini), written in decreasing order
imp_gini = rfc_best.feature_importances_
idx = imp_gini.argsort()[::-1]
with open(output_gini, "w") as f:
    for feat, imp in zip(X.columns[idx], imp_gini[idx]):
        f.write(str(feat) + "\t" + str(imp) + "\n")

# Importance (Permutation), written in decreasing order of mean importance
imp_perm = permutation_importance(rfc_best, X_test, y_test, n_repeats=100,
                                  random_state=args.random, n_jobs=-1)
idx = imp_perm.importances_mean.argsort()[::-1]
with open(output_perm, "w") as f:
    for feat, imp in zip(X.columns[idx], imp_perm.importances_mean[idx]):
        f.write(str(feat) + "\t" + str(imp) + "\n")

# Explain the model's predictions using SHAP:
# one row per test sample, one column per feature (same order as X_test.columns)
explainer = shap.TreeExplainer(rfc_best)
shap_values = explainer.shap_values(X_test)
with open(output_shap, "w") as f:
    for a in shap_values:
        f.write('\t'.join([str(i) for i in a]) + '\n')

# DONE
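
# Example invocation (hypothetical script and file names; substitute your own paths):
#   python train_rf.py \
#       --chromatin chromatin/ --marks marks.txt \
#       --expression expression.tsv --genes genes.bed \
#       --chrom_time T0 --expr_time T4 \
#       --random 1 --cv 10 --n_iter 100
#
# The fitted model can later be reloaded for prediction on the saved test matrix;
# a minimal sketch, assuming the output files above sit in the working directory:
#   import pickle
#   import pandas as pd
#   with open('model.pickle', 'rb') as fh:
#       rf = pickle.load(fh)
#   X_test = pd.read_csv('x.txt', sep='\t', index_col=0)
#   y_pred = rf.predict(X_test)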