```{r }
# load R libraries
library("sleuth")
library('cowplot')
library("RSQLite")
library("dbplyr")
library("dplyr")
library("tidyr")
library("tximportData")
library("rhdf5")
library("knitr")
library("rmarkdown")
new_position_theme <- theme(legend.position = c(0.80, 0.90))


#==================================================================================================
# Import GENCODE V24 annotation
#==================================================================================================

# transcript quantifications are generated with kallisto using GENCODE V24 annotation
# gene and transcript ids can be obtained from the kallisto's quantification file for any of the samples

# read in file
sample_id <- dir(file.path("/data/projects/roni_rnaseq_de_august2018/", "data_quant"))
itsv <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv")

target_mapping <- read.delim(itsv)

# extract information from target_id
target_mapping$target_id_raw <- target_mapping$target_id
target_id_names <- c('transcript_id', 'gene_id', 'ott1', 'ott2', 'transcript_name', 'gene_name', 'transcript_length', 'gene_type', 'empty')
target_mapping <- target_mapping %>% separate(target_id_raw, target_id_names, "\\|")

# select specific columns
target_mapping <- target_mapping[, c("target_id", "gene_id", 'gene_name')]


kal_dirs <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/")
kal_dirs

s2c <- read.table(file.path("/data/projects/roni_rnaseq_de_august2018", "metadata", "hiseq_info.txt"), header = TRUE, stringsAsFactors=FALSE)
s2c <- dplyr::select(s2c, sample = sample, condition)

s2c <- dplyr::mutate(s2c, path = kal_dirs)
print(s2c) #Full table

so <- sleuth_prep(s2c, ~ condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) 

png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_all_samples.png", res = 150, w = 1500, h = 500)
par(mfrow = c(1, 2))
p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') +
  new_position_theme
p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE)
plot_grid(p1, p2, align = "h")
dev.off()

png("/data/projects/roni_rnaseq_de_august2018/figures/group_density.png", res = 150, w = 1500, h = 500)
plot_group_density(so, use_filtered = TRUE, units = "est_counts",
                   trans = "log", grouping = setdiff(colnames(so$sample_to_covariates),
                                                     "sample"), offset = 1)
dev.off()
 
#First comparison 2D_wt vs 2D_shx

condA <- which(s2c$condition == "2D_wt")
condB <- which(s2c$condition == "2D_sh")
s2c_AvsB <- s2c[c(condA,condB),]
print(s2c_AvsB) #2D_wt vs 2D_shx

#Aggregate transcripts into genes
so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, aggregation_column = 'gene_name', extra_bootstrap_summary = TRUE) 
#Not aggregating transcripts
#so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) 

#PCA PLOT with 2D samples
png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_2D.png", res = 150, w = 1500, h = 500)
par(mfrow = c(1, 2))
p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') +
  new_position_theme
p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE)
plot_grid(p1, p2, align = "h")
dev.off()

so <- sleuth_fit(so, ~condition, 'full')
so <- sleuth_fit(so, ~1, 'reduced')

# LTR test (to obtain the q-values):
so <- sleuth_lrt(so, 'reduced', 'full')
models(so)

# WT test (to obtain the beta values). NOT WORKING because we aggregate at gene level
#so <- sleuth_wt(so, 'condition2D_wt')


res_lrt <- sleuth_results(so, 'reduced:full', test_type = 'lrt')
#res_wt <- sleuth_results(so, 'condition2D_wt', 'wt',"full", show_all = TRUE)
#res <- merge(res_lrt, res_wt[, c('target_id', 'b', 'se_b', 'mean_obs')], on = 'target_id', sort = FALSE)

sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt')
#sleuth_table <- sleuth_results(so, 'condition2D_wt', 'wt')
sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)
#Number of DE genes
nrow(sleuth_significant)


#Add information on up/down regulation. Calculating means from samples.
temphigh<-s2c_AvsB$sample[s2c_AvsB$condition=="2D_wt"]
templow<-s2c_AvsB$sample[s2c_AvsB$condition=="2D_sh"]
hiraw<-subset(so$obs_raw,so$obs_raw$sample %in% temphigh)
lowraw<-subset(so$obs_raw,so$obs_raw$sample %in% templow)
highmeans<-aggregate(tpm~target_id,data=hiraw,FUN=function(x) c(mean=mean(x)))
colnames(highmeans)<-c("target_id","tpm_wt")
lowmeans<-aggregate(tpm~target_id,data=lowraw,FUN=function(x) c(mean=mean(x)))
colnames(lowmeans)<-c("target_id","tpm_sh")
merged_means<-merge(lowmeans,highmeans,by=c("target_id"))
merged_means2<-merge(merged_means,target_mapping,by=c("target_id"))
sleuth_table_wTPM<-left_join(sleuth_table, merged_means2, by=c("gene_id"))

wt<-aggregate(tpm_wt ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)
sh<-aggregate(tpm_sh ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)

sleuth_table_wTPM$tpm_sh<-NULL
sleuth_table_wTPM$tpm_wt<-NULL
sleuth_table_wTPM$target_id.y<-NULL
sleuth_table_wTPM<-sleuth_table_wTPM[!duplicated(sleuth_table_wTPM), ] #Remove duplicates
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, wt, by=c("gene_id"))
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, sh, by=c("gene_id"))
sleuth_table_wTPM$tpm_wt_by_sh=(sleuth_table_wTPM$tpm_wt+0.1)/(sleuth_table_wTPM$tpm_sh+0.1)

sleuth_significant <- dplyr::filter(sleuth_table_wTPM, qval <= 0.05)
sleuth_significant$target_id.x<-NULL
write.csv(sleuth_significant, file = "/data/projects/roni_rnaseq_de_august2018/tables/significant2D_wt_vs_sh.csv", quote=FALSE,row.names=FALSE)


#Second comparison 3D_wt vs 3D_sh
condA <- which(s2c$condition == "3D_wt")
condB <- which(s2c$condition == "3D_sh")
s2c_AvsB <- s2c[c(condA,condB),]
print(s2c_AvsB) #3D_wt vs 3D_sh

so <- sleuth_prep(s2c_AvsB, ~ condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) 

#Plot PCA with only 3D samples
png("/data/projects/roni_rnaseq_de_august2018/figures/pca_tpm_3D.png", res = 150, w = 1500, h = 500)
par(mfrow = c(1, 2))
p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'tpm', color_by = 'condition') +
  new_position_theme
p2 <- plot_pc_variance(so, units = 'tpm', PC_relative = TRUE)
plot_grid(p1, p2, align = "h")
dev.off()

so <- sleuth_fit(so, ~condition, 'full')
so <- sleuth_fit(so, ~1, 'reduced')
models(so)
so <- sleuth_lrt(so, 'reduced', 'full')


sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE)
sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)
head(sleuth_significant, 20)
```


---
title: "differential_expression_JL.R"
author: "jl"
date: "Thu Aug  9 15:00:42 2018"
---