#source("https://bioconductor.org/biocLite.R")
#biocLite("tximport")
library("tximport")
library("tximportData")
library("dplyr")
library("tidyr")
library("DESeq2")


sample_id <- dir(file.path("/data/projects/roni_rnaseq_de_august2018/", "data_quant2"))
itsv <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant2", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv")

target_mapping <- read.delim(itsv)

# extract information from target_id
target_mapping$target_id_raw <- target_mapping$target_id
target_id_names <- c('transcript_id', 'gene_id', 'ott1', 'ott2', 'transcript_name', 'gene_name', 'transcript_length', 'gene_type', 'empty')
target_mapping <- target_mapping %>% separate(target_id_raw, target_id_names, "\\|")

# select specific columns
target_mapping <- target_mapping[, c("target_id", "gene_id", 'gene_name')]
tx2gene<-target_mapping[,c("target_id","gene_id")]


kal_dirs <- file.path("/data/projects/roni_rnaseq_de_august2018", "data_quant2", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/abundance.h5")
names(kal_dirs)<-sample_id

s2c <- read.table(file.path("/data/projects/roni_rnaseq_de_august2018", "metadata", "hiseq_info2.txt"), header = TRUE, stringsAsFactors=FALSE)
s2c <- dplyr::select(s2c, sample = sample, condition)

s2c <- dplyr::mutate(s2c, path = kal_dirs)
print(s2c) #Full table
row.names(s2c)<-sample_id
names(kal_dirs) <- sample_id


'''
Import kallisto abundances 
kallisto abundance.h5 files can be imported by setting type to "kallisto". 
Note that this requires that you have the Bioconductor package rhdf5 installed. 
'''

txi.kallisto <- tximport::tximport(s2c$path, type = "kallisto", tx2gene=tx2gene)
head(txi.kallisto$counts)

write.csv(txi.kallisto$counts, file = "/data/projects/roni_rnaseq_de_august2018/tables/rw34-rw38_kallisto_counts.csv", quote=FALSE,row.names=TRUE)