## ----style, echo = FALSE, results = 'asis'---------------- BiocStyle::markdown() options(width = 60, max.print = 1000) knitr::opts_chunk$set( eval = as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache = as.logical(Sys.getenv("KNITR_CACHE", "TRUE")), tidy.opts = list(width.cutoff = 60), tidy = TRUE ) ## ----setup, echo=FALSE, message=FALSE, warning=FALSE, eval=TRUE---- suppressPackageStartupMessages({ library(systemPipeR) }) ## ----genNew_wf, eval=FALSE-------------------------------- # systemPipeRdata::genWorkenvir(workflow = "varseq", mydirname = "varseq") # setwd("varseq") ## ----create_sal, message=FALSE, eval=FALSE---------------- # sal <- SPRproject() ## ----load_SPR, message=FALSE, eval=FALSE, spr=TRUE-------- # # Some samples in the test dataset do not work well in VARseq, and VARseq workflow # # takes long time to process each sample. To better test and speed up the test workflow, # # sample set is reduced to the first 13 samples. # # Please REMOVE the next two lines in your real analysis # cat(crayon::red$bold("Some samples in targets are removed for test workflow. Please change the template to disable this in your real analysis.\n")) # writeLines(readLines("targetsPE.txt")[1:13], "targetsPE.txt") # # cat(crayon::blue$bold("To use this workflow, following R packages are expected:\n")) # cat(c("'GenomicFeatures", "VariantAnnotation", "GenomicFeatures", "ggbio", "ggplot2'\n"), sep = "', '") # ###pre-end # appendStep(sal) <- LineWise( # code = { # library(systemPipeR) # }, # step_name = "load_SPR" # ) ## ----fastq_report_pre, eval=FALSE, message=FALSE, spr=TRUE---- # appendStep(sal) <- LineWise( # code = { # targets <- read.delim("targetsPE.txt", comment.char = "#") # updateColumn(sal, step = "load_SPR", position = "targetsWF") <- targets # fq_files <- getColumn(sal, "load_SPR", "targetsWF", column = 1) # fqlist <- seeFastq(fastq = fq_files, batchsize = 10000, klength = 8) # png("./results/fastqReport.png", height = 162, width = 288 * length(fqlist)) # seeFastqPlot(fqlist) # dev.off() # }, # step_name = "fastq_report_pre", # dependency = "load_SPR" # ) ## ----trimmomatic, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- SYSargsList( # step_name = "trimmomatic", # targets = "targetsPE.txt", # wf_file = "trimmomatic/trimmomatic-pe.cwl", # input_file = "trimmomatic/trimmomatic-pe.yml", # dir_path = "param/cwl", # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("fastq_report_pre"), # run_step = "optional" # ) ## ----preprocessing, message=FALSE, eval=FALSE, spr=TRUE---- # appendStep(sal) <- SYSargsList( # step_name = "preprocessing", # targets = "targetsPE.txt", dir = TRUE, # wf_file = "preprocessReads/preprocessReads-pe.cwl", # input_file = "preprocessReads/preprocessReads-pe.yml", # dir_path = "param/cwl", # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("fastq_report_pre"), # run_step = "optional" # ) ## ----custom_preprocessing_function, eval=FALSE------------ # appendStep(sal) <- LineWise( # code = { # filterFct <- function(fq, cutoff = 20, Nexceptions = 0) { # qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE) # # Retains reads where Phred scores are >= cutoff with N exceptions # fq[qcount <= Nexceptions] # } # save(list = ls(), file = "param/customFCT.RData") # }, # step_name = "custom_preprocessing_function", # dependency = "preprocessing" # ) ## ----editing_preprocessing, message=FALSE, eval=FALSE----- # yamlinput(sal, "preprocessing")$Fct # yamlinput(sal, "preprocessing", "Fct") <- "'filterFct(fq, cutoff=20, Nexceptions=0)'" # yamlinput(sal, "preprocessing")$Fct ## check the new function # cmdlist(sal, "preprocessing", targets = 1) ## check if the command line was updated with success ## ----fastq_report_pos, eval=FALSE, message=FALSE, spr=TRUE---- # appendStep(sal) <- LineWise( # code = { # fq_files <- getColumn(sal, "preprocessing", "outfiles", column = 1) ## get outfiles path # fqlist <- seeFastq(fastq = fq_files, batchsize = 10000, klength = 8) # png("./results/fastqReport_pos.png", height = 18, width = 4 * length(fqlist)) # seeFastqPlot(fqlist) # dev.off() # }, # step_name = "fastq_report_pos", # dependency = "trimmomatic", # run_step = "optional" # ) ## ----bwa_index, eval=FALSE, spr=TRUE---------------------- # appendStep(sal) <- SYSargsList( # step_name = "bwa_index", # dir = FALSE, targets = NULL, # wf_file = "gatk/workflow_bwa-index.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = "load_SPR" # ) ## ----fasta_index, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- SYSargsList( # step_name = "fasta_index", # dir = FALSE, targets = NULL, # wf_file = "gatk/workflow_fasta_dict.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = "bwa_index" # ) ## ----faidx_index, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- SYSargsList( # step_name = "faidx_index", # dir = FALSE, targets = NULL, # wf_file = "gatk/workflow_fasta_faidx.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = "fasta_index" # ) ## ----bwa_alignment, eval=FALSE, spr=TRUE------------------ # appendStep(sal) <- SYSargsList( # step_name = "bwa_alignment", # targets = "targetsPE.txt", # wf_file = "gatk/workflow_bwa-pe.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("faidx_index") # ) ## ----align_stats, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # bampaths <- getColumn(sal, step = "bwa_alignment", "outfiles", column = "samtools_sort_bam") # fqpaths <- getColumn(sal, step = "bwa_alignment", "targetsWF", column = "FileName1") # read_statsDF <- alignStats(args = bampaths, fqpaths = fqpaths, pairEnd = TRUE) # write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE, quote = FALSE, sep = "\t") # }, # step_name = "align_stats", # dependency = "bwa_alignment", # run_step = "optional" # ) ## ----bam_urls, eval=FALSE, spr=TRUE----------------------- # appendStep(sal) <- LineWise( # code = { # bampaths <- getColumn(sal, step = "bwa_alignment", "outfiles", column = "samtools_sort_bam") # symLink2bam( # sysargs = bampaths, htmldir = c("~/.html/", "somedir/"), # urlbase = "http://cluster.hpcc.ucr.edu/~tgirke/", # urlfile = "./results/IGVurl.txt" # ) # }, # step_name = "bam_urls", # dependency = "bwa_alignment", # run_step = "optional" # ) ## ----fastq2ubam, eval=FALSE, spr=TRUE--------------------- # appendStep(sal) <- SYSargsList( # step_name = "fastq2ubam", # targets = "targetsPE.txt", # wf_file = "gatk/workflow_gatk_fastq2ubam.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("faidx_index") # ) ## ----merge_bam, eval=FALSE, spr=TRUE---------------------- # appendStep(sal) <- SYSargsList( # step_name = "merge_bam", # targets = c("bwa_alignment", "fastq2ubam"), # wf_file = "gatk/workflow_gatk_mergebams.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c( # bwa_men_sam = "_bwasam_", # ubam = "_ubam_", # SampleName = "_SampleName_" # ), # rm_targets_col = c("preprocessReads_1", "preprocessReads_2"), # dependency = c("bwa_alignment", "fastq2ubam") # ) ## ----sort, eval=FALSE, spr=TRUE--------------------------- # appendStep(sal) <- SYSargsList( # step_name = "sort", # targets = "merge_bam", # wf_file = "gatk/workflow_gatk_sort.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c(merge_bam = "_mergebam_", SampleName = "_SampleName_"), # rm_targets_col = c( # "bwa_men_sam", "ubam", "SampleName_fastq2ubam", # "Factor_fastq2ubam", "SampleLong_fastq2ubam", # "Experiment_fastq2ubam", "Date_fastq2ubam" # ), # dependency = c("merge_bam") # ) ## ----mark_dup, eval=FALSE, spr=TRUE----------------------- # appendStep(sal) <- SYSargsList( # step_name = "mark_dup", # targets = "sort", # wf_file = "gatk/workflow_gatk_markduplicates.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c(sort_bam = "_sort_", SampleName = "_SampleName_"), # rm_targets_col = c("merge_bam"), # dependency = c("sort") # ) ## ----fix_tag, eval=FALSE, spr=TRUE------------------------ # appendStep(sal) <- SYSargsList( # step_name = "fix_tag", # targets = "mark_dup", # wf_file = "gatk/workflow_gatk_fixtag.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c(mark_bam = "_mark_", SampleName = "_SampleName_"), # rm_targets_col = c("sort_bam"), # dependency = c("mark_dup") # ) ## ----hap_caller, eval=FALSE, spr=TRUE--------------------- # appendStep(sal) <- SYSargsList( # step_name = "hap_caller", # targets = "fix_tag", # wf_file = "gatk/workflow_gatk_haplotypecaller.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c(fixtag_bam = "_fixed_", SampleName = "_SampleName_"), # rm_targets_col = c("mark_bam"), # dependency = c("fix_tag") # ) ## ----import, eval=FALSE, spr=TRUE------------------------- # appendStep(sal) <- SYSargsList( # step_name = "import", # targets = NULL, dir = FALSE, # wf_file = "gatk/workflow_gatk_genomicsDBImport.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = c("hap_caller") # ) ## ----call_variants, eval=FALSE, spr=TRUE------------------ # appendStep(sal) <- SYSargsList( # step_name = "call_variants", # targets = NULL, dir = FALSE, # wf_file = "gatk/workflow_gatk_genotypeGVCFs.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = c("import") # ) ## ----filter, eval=FALSE, spr=TRUE------------------------- # appendStep(sal) <- SYSargsList( # step_name = "filter", # targets = NULL, dir = FALSE, # wf_file = "gatk/workflow_gatk_variantFiltration.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # dependency = c("call_variants") # ) ## ----create_vcf, eval=FALSE, spr=TRUE--------------------- # appendStep(sal) <- SYSargsList( # step_name = "create_vcf", # targets = "hap_caller", # wf_file = "gatk/workflow_gatk_select_variant.cwl", # input_file = "gatk/gatk.yaml", # dir_path = "param/cwl", # inputvars = c(SampleName = "_SampleName_"), # dependency = c("hap_caller", "filter") # ) ## ----create_vcf_BCFtool, eval=FALSE, spr=TRUE------------- # appendStep(sal) <- SYSargsList( # step_name = "create_vcf_BCFtool", # targets = "bwa_alignment", dir = TRUE, # wf_file = "workflow-bcftools/workflow_bcftools.cwl", # input_file = "workflow-bcftools/bcftools.yml", # dir_path = "param/cwl", # inputvars = c(bwa_men_sam = "_bwasam_", SampleName = "_SampleName_"), # rm_targets_col = c("preprocessReads_1", "preprocessReads_2"), # dependency = "bwa_alignment", # run_step = "optional" # ) ## ----inspect_vcf, eval=FALSE------------------------------ # library(VariantAnnotation) # vcf_raw <- getColumn(sal, "create_vcf") # vcf <- readVcf(vcf_raw[1], "A. thaliana") # vcf # vr <- as(vcf, "VRanges") # vr ## ----filter_vcf, eval=FALSE, spr=TRUE--------------------- # appendStep(sal) <- LineWise( # code = { # vcf_raw <- getColumn(sal, "create_vcf") # library(VariantAnnotation) # filter <- "totalDepth(vr) >= 2 & (altDepth(vr) / totalDepth(vr) >= 0.8)" # vcf_filter <- suppressWarnings(filterVars(vcf_raw, filter, organism = "A. thaliana", out_dir = "results/vcf_filter")) # # dump the filtered path variable to running enviornment so # # other sysArg steps can get its values # updateColumn(sal, 'create_vcf', "outfiles") <- data.frame(vcf_filter=vcf_filter) # }, # step_name = "filter_vcf", # dependency = "create_vcf" # ) ## ----filter_vcf_BCFtools, eval=FALSE, spr=TRUE------------ # appendStep(sal) <- LineWise( # code = { # vcf_raw <- getColumn(sal, step = "create_vcf_BCFtool", # position = "outfiles", column = "bcftools_call") # library(VariantAnnotation) # filter <- "rowSums(vr) >= 2 & (rowSums(vr[,3:4])/rowSums(vr[,1:4]) >= 0.8)" # vcf_filter_bcf <- suppressWarnings(filterVars(vcf_raw, filter, organism = "A. thaliana", out_dir = "results/vcf_filter_BCFtools", varcaller = "bcftools")) # # updateColumn(sal, 'create_vcf', "outfiles") <- data.frame(vcf_filter_bcf=vcf_filter_bcf) # }, # step_name = "filter_vcf_BCFtools", # dependency = "create_vcf_BCFtool", # run_step = "optional" # ) ## ----check_filter, eval=FALSE----------------------------- # copyEnvir(sal, "vcf_raw", globalenv()) # copyEnvir(sal, "vcf_filter", globalenv()) # length(as(readVcf(vcf_raw[1], genome = "Ath"), "VRanges")[, 1]) # length(as(readVcf(vcf_filter[1], genome = "Ath"), "VRanges")[, 1]) ## ----annotate_basics, eval=FALSE-------------------------- # library("GenomicFeatures") # # comment the next line if optional step "filter_vcf" is included # vcf_filter <- getColumn(sal, "create_vcf") # # uncomment the next line if optional step "filter_vcf" is included # # copyEnvir(sal, "vcf_filter", globalenv()) # txdb <- loadDb("./data/tair10.sqlite") # vcf <- readVcf(vcf_filter[1], "A. thaliana") # locateVariants(vcf, txdb, CodingVariants()) ## ----annotate_basics_non-synon, eval=FALSE---------------- # fa <- FaFile("data/tair10.fasta") # predictCoding(vcf, txdb, seqSource = fa) ## ----annotate_vcf, eval=FALSE, spr=TRUE------------------- # appendStep(sal) <- LineWise( # code = { # # get the filtered vcf path from R running environment # copyEnvir(sal, "vcf_filter", globalenv()) # library("GenomicFeatures") # txdb <- loadDb("./data/tair10.sqlite") # fa <- FaFile("data/tair10.fasta") # vcf_anno <- suppressMessages(suppressWarnings(variantReport(vcf_filter, txdb = txdb, fa = fa, organism = "A. thaliana", out_dir = "results/vcf_anno"))) # }, # step_name = "annotate_vcf", # dependency = "filter_vcf" # ) ## ----view_annotation, eval=FALSE-------------------------- # copyEnvir(sal, "vcf_anno", globalenv()) # read.delim(vcf_anno[1])[38:40, ] ## ----combine_var, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # combineDF <- combineVarReports(vcf_anno, filtercol = c(Consequence = "nonsynonymous")) # write.table(combineDF, "./results/combineDF_nonsyn.tsv", quote = FALSE, row.names = FALSE, sep = "\t") # }, # step_name = "combine_var", # dependency = "annotate_vcf" # ) ## ----summary_var, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # write.table(varSummary(vcf_anno), "./results/variantStats.tsv", quote = FALSE, col.names = NA, sep = "\t") # }, # step_name = "summary_var", # dependency = "combine_var" # ) ## ----venn_diagram, eval=FALSE, spr=TRUE------------------- # appendStep(sal) <- LineWise( # code = { # ## make a list of first three samples # varlist <- sapply(names(vcf_anno[1:3]), function(x) as.character(read.delim(vcf_anno[x])$VARID)) # vennset <- overLapper(varlist, type = "vennsets") # png("./results/vennplot_var.png") # vennPlot(list(vennset), mymain = "Venn Plot of First 3 Samples", mysub = "", colmode = 2, ccol = c("red", "blue")) # dev.off() # }, # step_name = "venn_diagram", # dependency = "annotate_vcf" # ) ## ----plot_variant, eval=FALSE, spr=TRUE------------------- # appendStep(sal) <- LineWise( # code = { # # get the filtered vcf path from R running environment # copyEnvir(sal, "vcf_filter", globalenv()) # library(ggbio) # library(VariantAnnotation) # mychr <- "ChrM" # mystart <- 19000 # myend <- 21000 # bams <- getColumn(sal, "fix_tag") # vcf <- suppressWarnings(readVcf(vcf_filter["M6B"], "A. thaliana")) # ga <- readGAlignments(bams["M6B"], use.names = TRUE, param = ScanBamParam(which = GRanges(mychr, IRanges(mystart, myend)))) # p1 <- autoplot(ga, geom = "rect") # p2 <- autoplot(ga, geom = "line", stat = "coverage") # p3 <- autoplot(vcf[seqnames(vcf) == mychr], type = "fixed") + # xlim(mystart, myend) + # theme(legend.position = "none", axis.text.y = element_blank(), axis.ticks.y = element_blank()) # p4 <- autoplot(loadDb("./data/tair10.sqlite"), which = GRanges(mychr, IRanges(mystart, myend)), names.expr = "gene_id") # p1_4 <- tracks(Reads = p1, Coverage = p2, Variant = p3, Transcripts = p4, heights = c(0.3, 0.2, 0.1, 0.35)) + ylab("") # ggbio::ggsave(p1_4, filename = "./results/plot_variant.png", units = "in") # }, # step_name = "plot_variant", # dependency = "filter_vcf" # ) ## ----sessionInfo, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # sessionInfo() # }, # step_name = "sessionInfo", # dependency = "plot_variant") ## ----runWF, eval=FALSE------------------------------------ # sal <- runWF(sal) ## ----runWF_cluster, eval=FALSE---------------------------- # # wall time in mins, memory in MB # resources <- list(conffile=".batchtools.conf.R", # template="batchtools.slurm.tmpl", # Njobs=18, # walltime=120, # ntasks=1, # ncpus=4, # memory=1024, # partition = "short" # ) # sal <- addResources(sal, c("hisat2_mapping"), resources = resources) # sal <- runWF(sal) ## ----plotWF, eval=FALSE----------------------------------- # plotWF(sal, rstudio = TRUE) ## ----statusWF, eval=FALSE--------------------------------- # sal # statusWF(sal) ## ----logsWF, eval=FALSE----------------------------------- # sal <- renderLogs(sal) ## ----list_tools------------------------------------------- if(file.exists(file.path(".SPRproject", "SYSargsList.yml"))) { local({ sal <- systemPipeR::SPRproject(resume = TRUE) systemPipeR::listCmdTools(sal) systemPipeR::listCmdModules(sal) }) } else { cat(crayon::blue$bold("Tools and modules required by this workflow are:\n")) cat(c("trimmomatic/0.39", "samtools/1.14", "gatk/4.2.0.0", "bcftools/1.15", "bwa/0.7.17"), sep = "\n") } ## ----report_session_info, eval=TRUE----------------------- sessionInfo()