## ----options, results='hide', echo=FALSE--------------------------------------
#options(width=65)
options('useFancyQuotes' = FALSE, continue = " ", digits = 3)

## ----cite, eval=TRUE----------------------------------------------------------
citation("QuasR")

## ----install, eval=FALSE------------------------------------------------------
#  if (!require("BiocManager"))
#      install.packages("BiocManager")
#  BiocManager::install("QuasR")

## ----loadLibraries, eval=TRUE-------------------------------------------------
suppressPackageStartupMessages({
    library(QuasR)
    library(BSgenome)
    library(Rsamtools)
    library(rtracklayer)
    library(GenomicFeatures)
    library(txdbmaker)
    library(Gviz)
})

## ----help1, eval=FALSE--------------------------------------------------------
#  help.start()

## ----loadQuasRLibrary, eval=FALSE---------------------------------------------
#  library(QuasR)

## ----help2, eval=FALSE--------------------------------------------------------
#  ?preprocessReads

## ----help3, eval=FALSE--------------------------------------------------------
#  help("preprocessReads")

## ----assign, eval=FALSE-------------------------------------------------------
#  x <- 2

## ----ls, eval=FALSE-----------------------------------------------------------
#  ls()

## ----printObject, eval=FALSE--------------------------------------------------
#  x

## ----SampleSession1, eval=TRUE------------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

## ----SampleSession2, eval=TRUE------------------------------------------------
sampleFile <- "extdata/samples_chip_single.txt"
genomeFile <- "extdata/hg19sub.fa"

proj <- qAlign(sampleFile, genomeFile)
proj

## ----SampleSession3, eval=TRUE------------------------------------------------
qQCReport(proj, "extdata/qc_report.pdf")

## ----SampleSession4, eval=TRUE------------------------------------------------
library(rtracklayer)
library(GenomicFeatures)
annotFile <- "extdata/hg19sub_annotation.gtf"
txStart <- import.gff(annotFile, format = "gtf", feature.type = "start_codon")
promReg <- promoters(txStart, upstream = 500, downstream = 500)
names(promReg) <- mcols(promReg)$transcript_name

promCounts <- qCount(proj, query = promReg)
promCounts

## ----sampleFileSingle, echo=FALSE, results="asis"-----------------------------
cat(paste(readLines(system.file(package = "QuasR",
                                "extdata", "samples_chip_single.txt")),
          collapse = "\n"))

## ----sampleFilePaired, echo=FALSE, results="asis"-----------------------------
cat(paste(readLines(system.file(package = "QuasR",
                                "extdata", "samples_rna_paired.txt")),
      collapse = "\n"))

## ----sampleFile, eval=FALSE---------------------------------------------------
#  sampleFile1 <- system.file(package="QuasR", "extdata",
#                             "samples_chip_single.txt")
#  sampleFile2 <- system.file(package="QuasR", "extdata",
#                             "samples_rna_paired.txt")

## ----<sampleFileSeqToBam, eval=FALSE------------------------------------------
#  sampleFile1 <- "samples_fastq.txt"
#  sampleFile2 <- "samples_bam.txt"
#  
#  proj1 <- qAlign(sampleFile1, genomeFile)
#  
#  write.table(alignments(proj1)$genome, sampleFile2, sep="\t", row.names=FALSE)
#  
#  proj2 <- qAlign(sampleFile2, genomeFile)

## ----auxFile, echo=FALSE, results="asis"--------------------------------------
cat(paste(readLines(system.file(package = "QuasR",
                                "extdata", "auxiliaries.txt")),
          collapse = "\n"))

## ----auxiliaryFile, eval=TRUE-------------------------------------------------
auxFile <- system.file(package = "QuasR", "extdata", "auxiliaries.txt")

## ----selectGenomeBSgenome, eval=TRUE------------------------------------------
available.genomes()
genomeName <- "BSgenome.Hsapiens.UCSC.hg19"

## ----selectGenomeFile, eval=FALSE---------------------------------------------
#  genomeFile <- system.file(package="QuasR", "extdata", "hg19sub.fa")

## ----preprocessReadsSingle,eval=TRUE------------------------------------------
td <- tempdir()
infiles <- system.file(package = "QuasR", "extdata",
                       c("rna_1_1.fq.bz2","rna_2_1.fq.bz2"))
outfiles <- file.path(td, basename(infiles))
res <- preprocessReads(filename = infiles,
                       outputFilename = outfiles,
                       truncateEndBases = 3,
                       Lpattern = "AAAAAAAAAA",
                       minLength = 14, 
                       nBases = 2)
res
unlink(outfiles)

## ----preprocessReadsPaired,eval=TRUE------------------------------------------
td <- tempdir()
infiles1 <- system.file(package = "QuasR", "extdata", "rna_1_1.fq.bz2")
infiles2 <- system.file(package = "QuasR", "extdata", "rna_1_2.fq.bz2")
outfiles1 <- file.path(td, basename(infiles1))
outfiles2 <- file.path(td, basename(infiles2))
res <- preprocessReads(filename = infiles1,
                       filenameMate = infiles2,
                       outputFilename = outfiles1,
                       outputFilenameMate = outfiles2,
                       nBases = 0)
res

## ----ChIP_copyExtdata, eval=TRUE----------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

## ----ChIP_qAlign, eval=TRUE---------------------------------------------------
sampleFile <- "extdata/samples_chip_single.txt"
auxFile <- "extdata/auxiliaries.txt"
genomeFile <- "extdata/hg19sub.fa"

proj1 <- qAlign(sampleFile, genome = genomeFile, auxiliaryFile = auxFile)
proj1

## ----ChIP_bamfiles1, eval=TRUE------------------------------------------------
list.files("extdata", pattern = ".bam$")

## ----ChIP_bamfiles2, eval=TRUE------------------------------------------------
list.files("extdata", pattern = "^chip_1_1_")[1:3]

## ----ChIP_qcplot1, eval=TRUE, echo=FALSE--------------------------------------
qcdat1 <- qQCReport(proj1, pdfFilename = "extdata/qc_report.pdf")

## ----ChIP_qcplots2, eval=TRUE-------------------------------------------------
qQCReport(proj1, pdfFilename = "extdata/qc_report.pdf")

## ----ChIP_alignmentStats, eval=TRUE-------------------------------------------
alignmentStats(proj1)

## ----ChIP_qExportWig, eval=TRUE-----------------------------------------------
qExportWig(proj1, binsize = 100L, scaling = TRUE, collapseBySample = TRUE)

## ----ChIP_GenomicFeatures, eval=TRUE------------------------------------------
library(txdbmaker)
annotFile <- "extdata/hg19sub_annotation.gtf"
chrLen <- scanFaIndex(genomeFile)
chrominfo <- data.frame(chrom = as.character(seqnames(chrLen)),
                        length = width(chrLen),
                        is_circular = rep(FALSE, length(chrLen)))
txdb <- makeTxDbFromGFF(file = annotFile, format = "gtf",
                        chrominfo = chrominfo,
                        dataSource = "Ensembl",
                        organism = "Homo sapiens")
promReg <- promoters(txdb, upstream = 1000, downstream = 500,
                     columns = c("gene_id","tx_id"))
gnId <- vapply(mcols(promReg)$gene_id,
               FUN = paste, FUN.VALUE = "",
               collapse = ",")
promRegSel <- promReg[ match(unique(gnId), gnId) ]
names(promRegSel) <- unique(gnId)
promRegSel

## ----ChIP_qCount, eval=TRUE---------------------------------------------------
cnt <- qCount(proj1, promRegSel)
cnt

## ----ChIP_visualize, eval=TRUE, fig.width=8, fig.height=4.5-------------------
gr1 <- import("Sample1.wig.gz")
gr2 <- import("Sample2.wig.gz")

library(Gviz)
axisTrack <- GenomeAxisTrack()
dTrack1 <- DataTrack(range = gr1, name = "Sample 1", type = "h")
dTrack2 <- DataTrack(range = gr2, name = "Sample 2", type = "h")
txTrack <- GeneRegionTrack(txdb, name = "Transcripts", showId = TRUE)
plotTracks(list(axisTrack, dTrack1, dTrack2, txTrack),
           chromosome = "chr3", extend.left = 1000)

## ----ChIP_rtracklayer, eval=TRUE----------------------------------------------
library(rtracklayer)
annotationFile <- "extdata/hg19sub_annotation.gtf"
tssRegions <- import.gff(annotationFile, format = "gtf",
                         feature.type = "start_codon",
                         colnames = "gene_name")
tssRegions <- tssRegions[!duplicated(tssRegions)]
names(tssRegions) <- rep("TSS", length(tssRegions))
head(tssRegions)

## ----ChIP_qProfile, eval=TRUE-------------------------------------------------
prS <- qProfile(proj1, tssRegions, upstream = 3000, downstream = 3000, 
                orientation = "same")
prO <- qProfile(proj1, tssRegions, upstream = 3000, downstream = 3000, 
                orientation = "opposite")
lapply(prS, "[", , 1:10)

## ----ChIP_visualizeProfile, eval=TRUE, fig.width=8, fig.height=4.5------------
prCombS <- do.call("+", prS[-1]) / prS[[1]]
prCombO <- do.call("+", prO[-1]) / prO[[1]]

plot(as.numeric(colnames(prCombS)), filter(prCombS[1,], rep(1/100,100)), 
     type = 'l', xlab = "Position relative to TSS",
     ylab = "Mean no. of alignments")
lines(as.numeric(colnames(prCombO)), filter(prCombO[1,], rep(1/100,100)), 
      type = 'l', col = "red")
legend(title = "strand", legend = c("same as query","opposite of query"), 
       x = "topleft", col = c("black","red"),
       lwd = 1.5, bty = "n", title.adj = 0.1)

## ----ChIP_BSgenomeProject, eval=FALSE-----------------------------------------
#  file.copy(system.file(package="QuasR", "extdata"), ".", recursive=TRUE)
#  
#  sampleFile <- "extdata/samples_chip_single.txt"
#  auxFile <- "extdata/auxiliaries.txt"
#  
#  available.genomes() # list available genomes
#  genomeName <- "BSgenome.Hsapiens.UCSC.hg19"
#  
#  proj1 <- qAlign(sampleFile, genome=genomeName, auxiliaryFile=auxFile)
#  proj1

## ----RNA_qAlign, eval=TRUE----------------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

sampleFile <- "extdata/samples_rna_paired.txt"
genomeFile <- "extdata/hg19sub.fa"

proj2 <- qAlign(sampleFile, genome = genomeFile,
                splicedAlignment = TRUE, aligner = "Rhisat2")
proj2

## ----RNA_alignmentStats, eval=TRUE--------------------------------------------
proj2unspl <- qAlign(sampleFile, genome = genomeFile,
                     splicedAlignment = FALSE)

alignmentStats(proj2)
alignmentStats(proj2unspl)

## ----RNA_qCount, eval=TRUE----------------------------------------------------
geneLevels <- qCount(proj2, txdb, reportLevel = "gene")
exonLevels <- qCount(proj2, txdb, reportLevel = "exon")

head(geneLevels)
head(exonLevels)

## ----RNA_RPMK, eval=TRUE------------------------------------------------------
geneRPKM <- t(t(geneLevels[,-1] / geneLevels[,1] * 1000)
              / colSums(geneLevels[,-1]) * 1e6)
geneRPKM

## ----RNA_junction, eval=TRUE--------------------------------------------------
exonJunctions <- qCount(proj2, NULL, reportLevel = "junction")
exonJunctions

## ----RNA_junction2, eval=TRUE-------------------------------------------------
knownIntrons <- unlist(intronsByTranscript(txdb))
isKnown <- overlapsAny(exonJunctions, knownIntrons, type = "equal")
table(isKnown)

tapply(rowSums(as.matrix(mcols(exonJunctions))),
       isKnown, summary)

## ----RNA_includeSpliced, eval=TRUE--------------------------------------------
exonBodyLevels <- qCount(proj2, txdb, reportLevel = "exon",
                         includeSpliced = FALSE)
summary(exonLevels - exonBodyLevels)

## ----RNA_qcplot1, eval=TRUE, echo=FALSE---------------------------------------
qcdat2 <- qQCReport(proj2unspl, pdfFilename = "qc_report.pdf")

## ----miRNA_extdata, eval=TRUE-------------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

## ----miRNA_preprocessReads, eval=TRUE-----------------------------------------
# prepare sample file with processed reads filenames
sampleFile <- file.path("extdata", "samples_mirna.txt")
sampleFile
sampleFile2 <- sub(".txt", "_processed.txt", sampleFile)
sampleFile2

tab <- read.delim(sampleFile, header = TRUE, as.is = TRUE)
tab

tab2 <- tab
tab2$FileName <- sub(".fa", "_processed.fa", tab$FileName)
write.table(tab2, sampleFile2, sep = "\t", quote = FALSE, row.names = FALSE)
tab2

# remove adapters
oldwd <- setwd(dirname(sampleFile))
res <- preprocessReads(tab$FileName,
                       tab2$FileName,
                       Rpattern = "TGGAATTCTCGGGTGCCAAGG")
res
setwd(oldwd)

## ----miRNA_lengthes, eval=TRUE, fig.width=6, fig.height=4.5-------------------
# get read lengths
library(Biostrings)
oldwd <- setwd(dirname(sampleFile))
lens <- fasta.seqlengths(tab$FileName, nrec = 1e5)
lens2 <- fasta.seqlengths(tab2$FileName, nrec = 1e5)
setwd(oldwd)
# plot length distribution
lensTab <- rbind(raw = tabulate(lens, 50),
                 processed = tabulate(lens2, 50))
colnames(lensTab) <- 1:50
barplot(lensTab/rowSums(lensTab)*100,
        xlab = "Read length (nt)", ylab = "Percent of reads")
legend(x = "topleft", bty = "n", fill = gray.colors(2),
       legend = rownames(lensTab))

## ----miRNA_qAlign, eval=TRUE--------------------------------------------------
proj3 <- qAlign(sampleFile2, genomeFile, maxHits = 50)
alignmentStats(proj3)

## ----miRNA_prepareQuery, eval=TRUE--------------------------------------------
mirs <- import("extdata/mirbaseXX_qsr.gff3")
names(mirs) <- mirs$Name
preMirs <- mirs[ mirs$type == "miRNA_primary_transcript" ]
matureMirs <- mirs[ mirs$type == "miRNA" ]

## ----miRNA_coverage, eval=TRUE, fig.width=6, fig.height=4.5-------------------
library(Rsamtools)
alns <- scanBam(alignments(proj3)$genome$FileName,
                param = ScanBamParam(what = scanBamWhat(),
                                     which = preMirs[1]))[[1]]
alnsIR <- IRanges(start = alns$pos - start(preMirs), width = alns$qwidth)
mp <- barplot(as.vector(coverage(alnsIR)), names.arg = seq_len(max(end(alnsIR))),
              xlab = "Relative position in pre-miRNA",
              ylab = "Alignment coverage")
rect(xleft = mp[start(matureMirs) - start(preMirs) + 1,1],
     ybottom = -par('cxy')[2],
     xright = mp[end(matureMirs) - start(preMirs) + 1,1],
     ytop = 0, col = "#CCAA0088", border = NA, xpd = NA)

## ----miRNA_extendQuery, eval=TRUE---------------------------------------------
matureMirsExtended <- resize(matureMirs, width = 1L, fix = "start") + 3L

## ----miRNA_quantify, eval=TRUE------------------------------------------------
# quantify mature miRNAs
cnt <- qCount(proj3, matureMirsExtended, orientation = "same")
cnt

# quantify pre-miRNAs
cnt <- qCount(proj3, preMirs, orientation = "same")
cnt

## ----Bis_qAlign, eval=TRUE----------------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

sampleFile <- "extdata/samples_bis_single.txt"
genomeFile <- "extdata/hg19sub.fa"

proj4 <- qAlign(sampleFile, genomeFile, bisulfite = "dir")
proj4

## ----Bis_qMeth1, eval=TRUE----------------------------------------------------
meth <- qMeth(proj4, mode = "CpGcomb", collapseBySample = TRUE)
meth

## ----Bis_qMeth2, eval=TRUE----------------------------------------------------
chrs <- readDNAStringSet(genomeFile)
sum(vcountPattern("CG",chrs))
length(qMeth(proj4))
length(qMeth(proj4, keepZero = FALSE))

## ----Bis_visualize, eval=TRUE, fig.height=4.5, fig.width=8--------------------
percMeth <- mcols(meth)[,2] * 100 / mcols(meth)[,1]
summary(percMeth)

axisTrack <- GenomeAxisTrack()
dTrack1 <- DataTrack(range = gr1, name = "H3K4me3", type = "h")
dTrack2 <- DataTrack(range = meth, data = percMeth,
                     name = "Methylation", type = "p")
txTrack <- GeneRegionTrack(txdb, name = "Transcripts", showId = TRUE)
plotTracks(list(axisTrack, dTrack1, dTrack2, txTrack),
           chromosome = "chr3", extend.left = 1000)

## ----Bis_query, eval=TRUE-----------------------------------------------------
qMeth(proj4, query = GRanges("chr1",IRanges(start = 31633, width = 2)),
      collapseBySample = TRUE)
qMeth(proj4, query = promRegSel, collapseByQueryRegion = TRUE,
      collapseBySample = TRUE)

## ----snpFile, echo=FALSE, results="asis"--------------------------------------
cat(paste(c(readLines(system.file(package = "QuasR",
                                  "extdata", "hg19sub_snp.txt"))[1:4], "..."),
          collapse = "\n"))

## ----Alelle_Extdata, eval=TRUE------------------------------------------------
file.copy(system.file(package = "QuasR", "extdata"), ".", recursive = TRUE)

## ----Allele_qAlign, eval=TRUE-------------------------------------------------
sampleFile <- "extdata/samples_chip_single.txt"
genomeFile <- "extdata/hg19sub.fa"
snpFile <- "extdata/hg19sub_snp.txt"
proj1SNP <- qAlign(sampleFile, genome = genomeFile, snpFile = snpFile)
proj1SNP

## ----Allele_qCount, eval=TRUE-------------------------------------------------
head(qCount(proj1, promRegSel))
head(qCount(proj1SNP, promRegSel))

## ----Allele_Bis, eval=TRUE----------------------------------------------------
sampleFile <- "extdata/samples_bis_single.txt"
genomeFile <- "extdata/hg19sub.fa"
proj4SNP <- qAlign(sampleFile, genomeFile,
                   snpFile = snpFile, bisulfite = "dir")
head(qMeth(proj4SNP, mode = "CpGcomb", collapseBySample = TRUE))

## ----qcplotsFig1, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotQualByCycle(qcdat1$raw$qa, lmat = rbind(1:2))

## ----qcplotsFig2, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotNuclByCycle(qcdat1$raw$qa, lmat = rbind(1:2))

## ----qcplotsFig3, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotDuplicated(qcdat1$raw$qa, lmat = rbind(1:2))

## ----qcplotsFig4, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotMappings(qcdat1$raw$mapdata, a4layout = FALSE)

## ----qcplotsFig5, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotUniqueness(qcdat1$raw$unique, a4layout = FALSE)

## ----qcplotsFig6, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotErrorsByCycle(qcdat1$raw$mm, lmat = rbind(1:2))

## ----qcplotsFig7, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotMismatchTypes(qcdat1$raw$mm, lmat = rbind(1:2))

## ----qcplotsFig8, eval=TRUE, echo=FALSE, fig.height=4, fig.width=8------------
QuasR:::plotFragmentDistribution(qcdat2$raw$frag, lmat = rbind(1:2))

## ----alignmentStats, eval=TRUE------------------------------------------------
# using bam files
alignmentStats(alignments(proj1)$genome$FileName)
alignmentStats(unlist(alignments(proj1)$aux))

# using a qProject object
alignmentStats(proj1)

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()

## ----cleanUp, eval=TRUE, echo=FALSE-------------------------------------------
unlink("extdata", recursive = TRUE, force = TRUE)