## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----QuickStart, eval=TRUE, message=FALSE, results='hide'---------------------
library(proActiv)

## List of STAR junction files as input
files <- list.files(system.file('extdata/vignette/junctions', 
                                package = 'proActiv'), full.names = TRUE)
## Vector describing experimental condition
condition <- rep(c('A549','HepG2'), each=3)
## Promoter annotation for human genome GENCODE v34 restricted to a subset of chr1
promoterAnnotation <- promoterAnnotation.gencode.v34.subset

result <- proActiv(files = files, 
                   promoterAnnotation = promoterAnnotation,
                   condition = condition)

## ----List files, eval=FALSE---------------------------------------------------
#  files <- list.files(system.file('extdata/vignette/junctions',
#                                  package = 'proActiv'), full.names = TRUE)

## ----Create annotation, eval=FALSE--------------------------------------------
#  ## From GTF file path
#  gtf.file <- system.file('extdata/vignette/annotation/gencode.v34.annotation.subset.gtf.gz',
#                          package = 'proActiv')
#  promoterAnnotation.gencode.v34.subset <- preparePromoterAnnotation(file = gtf.file,
#                                                                     species = 'Homo_sapiens')
#  ## From TxDb object
#  txdb.file <- system.file('extdata/vignette/annotation/gencode.v34.annotation.subset.sqlite',
#                           package = 'proActiv')
#  txdb <- loadDb(txdb.file)
#  promoterAnnotation.gencode.v34.subset <- preparePromoterAnnotation(txdb = txdb,
#                                                                     species = 'Homo_sapiens')

## ----Other species, eval=TRUE-------------------------------------------------
names(GenomeInfoDb::genomeStyles())

## ----proActiv, eval=FALSE, message=FALSE, results='hide'----------------------
#  promoterAnnotation <- promoterAnnotation.gencode.v34.subset
#  
#  condition <- rep(c('A549', 'HepG2'), each=3)
#  
#  result <- proActiv(files = files,
#                     promoterAnnotation = promoterAnnotation,
#                     condition = condition)

## ----proActiv result, eval=TRUE-----------------------------------------------
show(result)

## ----Result rowData, eval=TRUE, echo=FALSE------------------------------------
knitr::kable(head(rowData(result)))

## ----Filter result, eval=TRUE-------------------------------------------------
## Removes single-exon transcripts / promoters by eliminating promoter counts that are NA 
result <- result[complete.cases(assays(result)$promoterCounts),]

## ----Get alternative, eval=TRUE-----------------------------------------------
alternativePromoters <- getAlternativePromoters(result = result, referenceCondition = "A549")
show(alternativePromoters)

## ----Boxplot, eval=TRUE, message=FALSE, fig.align='center', fig.height=5, fig.width=7----
plots <- boxplotPromoters(result, "ENSG00000076864.19")

# Boxplot of absolute promoter activity
library(gridExtra)
grid.arrange(plots[[1]], plots[[3]], nrow = 1, ncol = 2, widths = c(3, 2))

## ----VizPromoterCategories, eval=TRUE, fig.align='center', fig.height=5, fig.width=5, message=FALSE, warning=FALSE----
library(ggplot2)

rdata <- rowData(result)
## Create a long dataframe summarizing cell line and promoter class
pdata1 <- data.frame(cellLine = rep(c('A549', 'HepG2'), each = nrow(rdata)),
                       promoterClass = as.factor(c(rdata$A549.class, rdata$HepG2.class)))

ggplot(na.omit(pdata1)) +
  geom_bar(aes(x = cellLine, fill = promoterClass)) + 
  xlab('Cell Lines') + ylab('Count') +  labs(fill = 'Promoter Category') +
  ggtitle('Categorization of Promoters')

## ----VizMajorMinorPosition, eval=TRUE, fig.align='center', fig.height=5, fig.width=5, message=FALSE, warning=FALSE----
## Because many genes have many annotated promoters, we collapse promoters 
## from the 5th position and onward into one group for simplicity
pdata2 <- as_tibble(rdata) %>%
  mutate(promoterPosition = ifelse(promoterPosition > 5, 5, promoterPosition)) %>%
  filter(HepG2.class %in% c('Major', 'Minor'))

ggplot(pdata2) +
  geom_bar(aes(x = promoterPosition, fill = as.factor(HepG2.class)), position = 'fill') +
  xlab(expression(Promoter ~ Position ~ "5'" %->% "3'")) + ylab('Percentage') + 
  labs(fill = 'Promoter Category') + ggtitle('Major/Minor Promoter Proportion in HepG2') + 
  scale_y_continuous(breaks = seq(0,1, 0.25), labels = paste0(seq(0,100,25),'%')) +
  scale_x_continuous(breaks = seq(1,5), labels = c('1','2','3','4','>=5'))

## ----VizMajorGeneExp, eval=TRUE, fig.align='center', fig.height=5, fig.width=6.5, message=FALSE, warning=FALSE----
## Get active major promoters of HepG2
majorPromoter <- as_tibble(rdata) %>% group_by(geneId) %>% 
  mutate(promoterCount = n()) %>% filter(HepG2.class == 'Major') 

pdata3 <- data.frame(proActiv = majorPromoter$HepG2.mean,
                     geneExp = majorPromoter$HepG2.gene.mean,
                     promoterCount = majorPromoter$promoterCount)

ggplot(pdata3, aes(x = geneExp, y = proActiv)) + 
  geom_point(aes(colour = promoterCount), alpha = 0.5) +
  ggtitle('Major Promoter Activity vs. Gene Expression in HepG2') + 
  xlab('Average Gene Expression') + ylab('Average Major Promoter Activity') +
  labs(colour = 'Number of \n Annotated Promoters') +
  geom_abline(slope = 1, intercept = 0, colour = 'red', linetype = 'dashed')

## ----VizTsne, eval=TRUE, fig.align='center', fig.height=5.2, fig.width=5.2----
library(Rtsne)

## Remove inactive promoters (sparse rows)
data <- assays(result)$absolutePromoterActivity %>% filter(rowSums(.) > 0)
data <- data.frame(t(data))
data$Sample <- as.factor(condition)

set.seed(40) # for reproducibility

tsne.out <- Rtsne(as.matrix(subset(data, select = -c(Sample))), perplexity = 1)
plot(x = tsne.out$Y[,1], y = tsne.out$Y[,2], bg = data$Sample, asp = 1,
     col = 'black', pch = 24, cex = 4,
     main = 't-SNE plot with promoters \n active in at least one sample',
     xlab = 'T-SNE1', ylab = 'T-SNE2',
     xlim = c(-300,300), ylim = c(-300,300))
legend('topright', inset = .02, title = 'Cell Lines',
       unique(condition), pch = c(24,24), pt.bg = 1:length(unique(condition)) , cex = 1.5, bty = 'n')

## ----SessionInfo, eval=TRUE, echo=FALSE---------------------------------------
sessionInfo()