# load R libraries library("sleuth") library('cowplot') library("RSQLite") library("dbplyr") library("dplyr") library("tidyr") library("tximportData") library("rhdf5") library("knitr") library("rmarkdown") new_position_theme <- theme(legend.position = c(0.80, 0.90)) #================================================================================================== # Import GENCODE V24 annotation #================================================================================================== # transcript quantifications are generated with kallisto using GENCODE V24 annotation # gene and transcript ids can be obtained from the kallisto's quantification file for any of the samples # read in file sample_id <- dir(file.path("/data/projects/roni_rnaseq_de_august2018/", "data_quant")) itsv <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv") target_mapping <- read.delim(itsv) # extract information from target_id target_mapping$target_id_raw <- target_mapping$target_id target_id_names <- c('transcript_id', 'gene_id', 'ott1', 'ott2', 'transcript_name', 'gene_name', 'transcript_length', 'gene_type', 'empty') target_mapping <- target_mapping %>% separate(target_id_raw, target_id_names, "\\|") # select specific columns target_mapping <- target_mapping[, c("target_id", "gene_id", 'gene_name')] kal_dirs <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/") kal_dirs s2c <- read.table(file.path("/data/projects/roni_rnaseq_de_august2018", "metadata", "hiseq_info.txt"), header = TRUE, stringsAsFactors=FALSE) s2c <- dplyr::select(s2c, sample = sample, condition) s2c <- dplyr::mutate(s2c, path = kal_dirs) print(s2c) #Full table so <- sleuth_prep(s2c, ~ condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_all_samples.png", res = 150, w = 1500, h = 500) par(mfrow = c(1, 2)) p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') + new_position_theme p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE) plot_grid(p1, p2, align = "h") dev.off() png("/data/projects/roni_rnaseq_de_august2018/figures/group_density.png", res = 150, w = 1500, h = 500) plot_group_density(so, use_filtered = TRUE, units = "est_counts", trans = "log", grouping = setdiff(colnames(so$sample_to_covariates), "sample"), offset = 1) dev.off() #First comparison 2D_wt vs 2D_shx condA <- which(s2c$condition == "2D_wt") condB <- which(s2c$condition == "2D_sh") s2c_AvsB <- s2c[c(condA,condB),] print(s2c_AvsB) #2D_wt vs 2D_shx #Aggregate transcripts into genes so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, aggregation_column = 'gene_name', extra_bootstrap_summary = TRUE) #Not aggregating transcripts #so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) #PCA PLOT with 2D samples png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_2D.png", res = 150, w = 1500, h = 500) par(mfrow = c(1, 2)) p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') + new_position_theme p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE) plot_grid(p1, p2, align = "h") dev.off() so <- sleuth_fit(so, ~condition, 'full') so <- sleuth_fit(so, ~1, 'reduced') # LTR test (to obtain the q-values): so <- sleuth_lrt(so, 'reduced', 'full') models(so) # WT test (to obtain the beta values). NOT WORKING because we aggregate at gene level so <- sleuth_wt(so, 'condition2D_wt') res_lrt <- sleuth_results(so, 'reduced:full', test_type = 'lrt') res_wt <- sleuth_results(so, 'condition2D_wt', 'wt',"full", show_all = TRUE) #res <- merge(res_lrt, res_wt[, c('target_id', 'b', 'se_b', 'mean_obs')], on = 'target_id', sort = FALSE) sleuth_table_lrt <- sleuth_results(so, 'reduced:full', 'lrt') sleuth_table_wt <- sleuth_results(so, 'condition2D_wt', 'wt') sleuth_significant <- dplyr::filter(sleuth_table_lrt, qval <= 0.05) sleuth_significant_wt <- dplyr::filter(sleuth_table_wt, qval <= 0.05) #Number of DE genes nrow(sleuth_significant) nrow(sleuth_significant_wt) #Add information on up/down regulation. Calculating means from samples. temphigh<-s2c_AvsB$sample[s2c_AvsB$condition=="2D_wt"] templow<-s2c_AvsB$sample[s2c_AvsB$condition=="2D_sh"] hiraw<-subset(so$obs_raw,so$obs_raw$sample %in% temphigh) lowraw<-subset(so$obs_raw,so$obs_raw$sample %in% templow) highmeans<-aggregate(tpm~target_id,data=hiraw,FUN=function(x) c(mean=mean(x))) colnames(highmeans)<-c("target_id","tpm_wt") lowmeans<-aggregate(tpm~target_id,data=lowraw,FUN=function(x) c(mean=mean(x))) colnames(lowmeans)<-c("target_id","tpm_sh") merged_means<-merge(lowmeans,highmeans,by=c("target_id")) merged_means2<-merge(merged_means,target_mapping,by=c("target_id")) sleuth_table_wTPM<-left_join(sleuth_table_lrt, merged_means2, by=c("gene_id")) wt<-aggregate(tpm_wt ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE) sh<-aggregate(tpm_sh ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE) sleuth_table_wTPM$tpm_sh<-NULL sleuth_table_wTPM$tpm_wt<-NULL sleuth_table_wTPM$target_id.y<-NULL sleuth_table_wTPM<-sleuth_table_wTPM[!duplicated(sleuth_table_wTPM), ] #Remove duplicates sleuth_table_wTPM<-left_join(sleuth_table_wTPM, wt, by=c("gene_id")) sleuth_table_wTPM<-left_join(sleuth_table_wTPM, sh, by=c("gene_id")) sleuth_table_wTPM$tpm_wt_by_sh=log2((sleuth_table_wTPM$tpm_wt+0.1)/(sleuth_table_wTPM$tpm_sh+0.1)) sleuth_significant <- dplyr::filter(sleuth_table_wTPM, qval <= 0.05) sleuth_significant$target_id.x<-NULL write.csv(sleuth_significant, file = "/data/projects/roni_rnaseq_de_august2018/tables/significant2D_wt_vs_sh.csv", quote=FALSE,row.names=FALSE) write.csv(sleuth_table_wTPM, file = "/data/projects/roni_rnaseq_de_august2018/tables/sleuth_table_wTPM_2D_wt_vs_sh.csv", quote=FALSE,row.names=FALSE) sleuth_table_wTPM$threshold <- as.factor(ifelse(sleuth_table_wTPM$qval < 0.05 & abs(log2(sleuth_table_wTPM$tpm_wt_by_sh)) >=1.5,ifelse(log2(sleuth_table_wTPM$tpm_wt_by_sh) > 1.5 ,'Up','Down'),'Not')) #Volcano plot png("/data/projects/roni_rnaseq_de_august2018/figures/volcano_2D.png", res = 150, w = 1500, h = 500) ggplot(data=sleuth_table_wTPM, aes(x=log2(tpm_wt_by_sh), y =-log10(qval), colour=threshold,fill=threshold)) + scale_color_manual(values=c("blue", "grey","red"))+ geom_point(alpha=0.4, size=1.2) + xlim(c(-4, 4)) + theme_bw(base_size = 12, base_family = "Times") + geom_vline(xintercept=c(-1.5,1.5),lty=4,col="grey",lwd=0.6)+ geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.6)+ theme(legend.position="right", panel.grid=element_blank(), legend.title = element_blank(), legend.text= element_text(face="bold", color="black",family = "Times", size=8), plot.title = element_text(hjust = 0.5), axis.text.x = element_text(face="bold", color="black", size=12), axis.text.y = element_text(face="bold", color="black", size=12), axis.title.x = element_text(face="bold", color="black", size=12), axis.title.y = element_text(face="bold",color="black", size=12))+ labs(x="log2 (fold change)",y="-log10 (q-value)",title="Volcano picture of DEG") dev.off() #Second comparison 3D_wt vs 3D_sh condA <- which(s2c$condition == "3D_wt") condB <- which(s2c$condition == "3D_sh") s2c_AvsB <- s2c[c(condA,condB),] print(s2c_AvsB) #3D_wt vs 3D_sh so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, aggregation_column = 'gene_name', extra_bootstrap_summary = TRUE) #Plot PCA with only 3D samples png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_3D.png", res = 150, w = 1500, h = 500) par(mfrow = c(1, 2)) p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') + new_position_theme p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE) plot_grid(p1, p2, align = "h") dev.off() so <- sleuth_fit(so, ~condition, 'full') so <- sleuth_fit(so, ~1, 'reduced') # LTR test (to obtain the q-values): so <- sleuth_lrt(so, 'reduced', 'full') models(so) # WT test (to obtain the beta values). NOT WORKING because we aggregate at gene level so <- sleuth_wt(so, 'condition3D_wt') res_lrt <- sleuth_results(so, 'reduced:full', test_type = 'lrt') res_wt <- sleuth_results(so, 'condition3D_wt', 'wt',"full", show_all = TRUE) #res <- merge(res_lrt, res_wt[, c('target_id', 'b', 'se_b', 'mean_obs')], on = 'target_id', sort = FALSE) sleuth_table_lrt <- sleuth_results(so, 'reduced:full', 'lrt') sleuth_table_wt <- sleuth_results(so, 'condition3D_wt', 'wt') sleuth_significant <- dplyr::filter(sleuth_table_lrt, qval <= 0.05) #Number of DE genes nrow(sleuth_significant) #Add information on up/down regulation. Calculating means from samples. temphigh<-s2c_AvsB$sample[s2c_AvsB$condition=="3D_wt"] templow<-s2c_AvsB$sample[s2c_AvsB$condition=="3D_sh"] hiraw<-subset(so$obs_raw,so$obs_raw$sample %in% temphigh) lowraw<-subset(so$obs_raw,so$obs_raw$sample %in% templow) highmeans<-aggregate(tpm~target_id,data=hiraw,FUN=function(x) c(mean=mean(x))) colnames(highmeans)<-c("target_id","tpm_wt") lowmeans<-aggregate(tpm~target_id,data=lowraw,FUN=function(x) c(mean=mean(x))) colnames(lowmeans)<-c("target_id","tpm_sh") merged_means<-merge(lowmeans,highmeans,by=c("target_id")) merged_means2<-merge(merged_means,target_mapping,by=c("target_id")) sleuth_table_wTPM<-left_join(sleuth_table_lrt, merged_means2, by=c("gene_id")) wt<-aggregate(tpm_wt ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE) sh<-aggregate(tpm_sh ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE) sleuth_table_wTPM$tpm_sh<-NULL sleuth_table_wTPM$tpm_wt<-NULL sleuth_table_wTPM$target_id.y<-NULL sleuth_table_wTPM<-sleuth_table_wTPM[!duplicated(sleuth_table_wTPM), ] #Remove duplicates sleuth_table_wTPM<-left_join(sleuth_table_wTPM, wt, by=c("gene_id")) sleuth_table_wTPM<-left_join(sleuth_table_wTPM, sh, by=c("gene_id")) sleuth_table_wTPM$tpm_wt_by_sh=log2((sleuth_table_wTPM$tpm_wt+0.1)/(sleuth_table_wTPM$tpm_sh+0.1)) sleuth_significant <- dplyr::filter(sleuth_table_wTPM, qval <= 0.05) sleuth_significant$target_id.x<-NULL write.csv(sleuth_significant, file = "/data/projects/roni_rnaseq_de_august2018/tables/significant3D_wt_vs_sh.csv", quote=FALSE,row.names=FALSE) write.csv(sleuth_table_wTPM, file = "/data/projects/roni_rnaseq_de_august2018/tables/sleuth_table_wTPM_3D_wt_vs_sh.csv", quote=FALSE,row.names=FALSE)