## ----style, echo = FALSE, results = 'asis'--------------------------------- BiocStyle::markdown() ## ----global_options, include=FALSE-------------------------------------------------------------------------- knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) options(width=110) set.seed(123) ## ----------------------------------------------------------------------------------------------------------- ## load ABAEnrichment package library(ABAEnrichment) ## create input data.frame with candidate genes gene_ids = c('NCAPG', 'APOL4', 'NGFR', 'NXPH4', 'C21orf59', 'CACNG2', 'AGTR1', 'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1', 'PAX2') input_hyper = data.frame(gene_ids, is_candidate=1) head(input_hyper) ## ---- eval=FALSE-------------------------------------------------------------------------------------------- # ## run enrichment analyses with default parameters # ## for the adult and developing human brain # res_adult = aba_enrich(input_hyper, dataset='adult') # res_devel = aba_enrich(input_hyper, dataset='5_stages') ## ----results='hide'----------------------------------------------------------------------------------------- ## run enrichment analysis with less cutoffs and randomsets ## to save computation time res_devel = aba_enrich(input_hyper, dataset='5_stages', cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100) ## ----------------------------------------------------------------------------------------------------------- ## extract first element from the output list, which contains the statistics fwers_devel = res_devel[[1]] ## see results for the brain regions with highest enrichment ## for children (3-11 yrs, age_category 3) fwers_3 = fwers_devel[fwers_devel[,1]==3, ] head(fwers_3) ## ----------------------------------------------------------------------------------------------------------- res_devel[2:3] ## ----eval=FALSE--------------------------------------------------------------------------------------------- # ## run enrichment analysis, with randomsets dependent on gene length # res_len = aba_enrich(input_hyper, gene_len=TRUE) # ## run the same analysis using gene-coordinates # ## from GRCh38 instead of the default GRCh37 # res_len_grch38 = aba_enrich(input_hyper, gene_len=TRUE, ref_genome='grch38') ## ----------------------------------------------------------------------------------------------------------- ## create input vector with a candidate region on chromosome 8 ## and background regions on chromosome 7, 8 and 9 regions = c('8:82000000-83000000', '7:1300000-56800000', '7:74900000-148700000', '8:7400000-44300000', '8:47600000-146300000', '9:0-39200000', '9:69700000-140200000') is_candidate = c(1, rep(0,6)) input_region = data.frame(regions, is_candidate) ## ----results='hide'----------------------------------------------------------------------------------------- ## run enrichment analysis for the adult human brain res_region = aba_enrich(input_region, dataset='adult', cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100) ## ----------------------------------------------------------------------------------------------------------- ## look at the results from the enrichment analysis fwers_region = res_region[[1]] head(fwers_region) ## see which genes are located in the candidate region input_genes = res_region[[2]] candidate_genes = input_genes[input_genes[,2]==1,] candidate_genes ## ----echo=FALSE--------------------------------------------------------------------------------------------- gene = c('NCAPG','APOL4','NGFR','NXPH4','C21orf59','CACNG2') chr = c('chr4', 'chr22', 'chr17', 'chr12', 'chr21', 'chr22') start = c(17812436, 36585176, 47572655, 57610578, 33954510, 36956916) end = c(17846487, 36600879, 47592382, 57620232, 33984918, 37098690) custom_coords = data.frame(gene, chr, start, end, stringsAsFactors=FALSE) ## ----------------------------------------------------------------------------------------------------------- ## example for a dataframe with custom gene-coordinates head(custom_coords) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # ## use correction for gene-length based on custom gene-coordinates # res_len_cc = aba_enrich(input_hyper, gene_len=TRUE, gene_coords=custom_coords) ## ----------------------------------------------------------------------------------------------------------- ## assign random scores to the genes used above scores = sample(1:50, length(gene_ids)) input_wicox = data.frame(gene_ids, scores) head(input_wicox) ## ---- results='hide'---------------------------------------------------------------------------------------- ## test for enrichment of expressed genes with high scores in the adult brain ## using the Wilcoxon rank-sum test res_wilcox = aba_enrich(input_wicox, test='wilcoxon', cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100) ## ----------------------------------------------------------------------------------------------------------- head(res_wilcox[[1]]) ## ----------------------------------------------------------------------------------------------------------- ## create a toy example dataset with two counts per gene high_A_genes = c('RFFL', 'NTS', 'LIPE', 'GALNT6', 'GSN', 'BTBD16', 'CERS2') low_A_genes = c('GDA', 'ENC1', 'EGR4', 'VIPR1', 'DOC2A', 'OASL', 'FRY', 'NAV3') A_counts = c(sample(20:30, length(high_A_genes)), sample(5:15, length(low_A_genes))) B_counts = c(sample(5:15, length(high_A_genes)), sample(20:30, length(low_A_genes))) input_binom = data.frame(gene_ids=c(high_A_genes, low_A_genes), A_counts, B_counts) head(input_binom) ## ----------------------------------------------------------------------------------------------------------- ## run binomial test res_binom = aba_enrich(input_binom, cutoff_quantiles=c(0.2,0.9), test='binomial', n_randsets=100, silent=TRUE) head(res_binom[[1]]) ## ----------------------------------------------------------------------------------------------------------- ## create a toy example with four counts per gene high_substi_genes = c('RFFL', 'NTS', 'LIPE', 'GALNT6', 'GSN', 'BTBD16', 'CERS2') low_substi_genes = c('ENC1', 'EGR4', 'NPTX1', 'DOC2A', 'OASL', 'FRY', 'NAV3') subs_non_syn = c(sample(5:15, length(high_substi_genes), replace=TRUE), sample(0:5, length(low_substi_genes), replace=TRUE)) subs_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)), replace=TRUE) vari_non_syn = c(sample(0:5, length(high_substi_genes), replace=TRUE), sample(0:10, length(low_substi_genes), replace=TRUE)) vari_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)), replace=TRUE) input_conti = data.frame(gene_ids=c(high_substi_genes, low_substi_genes), subs_non_syn, subs_syn, vari_non_syn, vari_syn) head(input_conti) ## the corresponding contingency table for the first gene would be: matrix(input_conti[1, 2:5], ncol=2, dimnames=list(c('non-synonymous', 'synonymous'), c('substitution','variable'))) ## ---- results='hide'---------------------------------------------------------------------------------------- res_conti = aba_enrich(input_conti, test='contingency', cutoff_quantiles=c(0.7,0.8,0.9), n_randset=100) ## ----------------------------------------------------------------------------------------------------------- head(res_conti[[1]]) ## ----------------------------------------------------------------------------------------------------------- ## get expression data for the top 5 regions ## and all input genes ## of the last aba_enrich analysis (res_conti) top_regions = res_conti[[1]][1:5, 'structure_id'] gene_ids = res_conti[[2]][,1] expr = get_expression(structure_ids=top_regions, gene_ids=gene_ids, dataset='adult') expr[1:6,1:6] ## ----------------------------------------------------------------------------------------------------------- get_expression(structure_ids=c('Allen:10657','Allen:10208'), gene_ids=c('RFFL', 'NTS', 'LIPE'), dataset='5_stages') ## ----------------------------------------------------------------------------------------------------------- ## plot the expression data from above plot_expression(expr, main="microarray expression data for adult brain") ## ----------------------------------------------------------------------------------------------------------- ## plot expression data for the top 5 regions ## of age-category 2 and all input genes ## of the developmental stage aba_enrich analysis above (res_devel) ## extract brain regions devel_stats = res_devel[[1]] devel_stats_2 = devel_stats[devel_stats$age_category==2,] top_regions_2 = devel_stats_2[1:5,'structure_id'] ## extract genes genes = res_devel[[2]][,1] ## get expression for all 5 age categories expr_all = get_expression(top_regions_2, genes, "5_stages") ## subset to age-cateogry 2 expr_2 = expr_all[["age_category_2"]] ## plot heatmap plot_expression(expr_2, main="RPKM from RNA-seq for age_category_2") ## ----------------------------------------------------------------------------------------------------------- ## plot expression data for the top 5 regions ## and all input genes ## from the 2x2-contingency table analysis (res_conti) ## extract brain regions top_regions = res_conti[[1]][1:5, 'structure_id'] ## extract genes gene_ids = res_conti[[2]][,1] ## get expression expr = get_expression(structure_ids=top_regions, gene_ids=gene_ids, dataset='adult') ## plot expression with sidebar plot_expression(expr, gene_vars=res_conti[[2]], main="heatmap with sidebar") ## ----------------------------------------------------------------------------------------------------------- ## use heatmap.2 to allow adjusting other parameters like color gplots::heatmap.2(expr, scale="none", col=gplots::greenred(100), main="custom heatmap", trace="none", keysize=1.4, key.xlab="expression", dendrogram="row", margins=c(6, 8)) ## ----------------------------------------------------------------------------------------------------------- ## get IDs of the substructures of the frontal neocortex (Allen:10161) ## for which expression data are available subs = get_sampled_substructures('Allen:10161') subs ## get the full name of those substructures get_name(subs) ## get the superstructures of the frontal neocortex (from general to special) supers = get_superstructures('Allen:10161') supers ## get the full names of those superstructures get_name(supers) ## ----------------------------------------------------------------------------------------------------------- ## get structure IDs of brain regions that contain 'accumbens' in their name get_id('accumbens') ## get structure IDs of brain regions that contain 'telencephalon' in their name get_id('telencephalon') ## ----------------------------------------------------------------------------------------------------------- ## get all brain regions included in ABAEnrichment together with thier IDs all_regions = get_id('') head(all_regions) ## ---- results='hide'---------------------------------------------------------------------------------------- ## run an enrichment analysis with 7 candidate and 7 background genes ## for the developing brain gene_ids = c('FOXJ1', 'NTS', 'LTBP1', 'STON2', 'KCNJ6', 'AGT', 'ANO3', 'TTR', 'ELAVL4', 'BEAN1', 'TOX', 'EPN3', 'PAX2', 'KLHL1') is_candidate = rep(c(1,0), each=7) input_hyper = data.frame(gene_ids, is_candidate) res = aba_enrich(input_hyper, dataset='5_stages', cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100) ## ----------------------------------------------------------------------------------------------------------- head(res[[1]]) ## see which candidate genes are annotated to brain regions with a FWER < 0.01 anno = get_annotated_genes(res, fwer_threshold=0.1) anno ## ----------------------------------------------------------------------------------------------------------- ## find out which of the above genes have expression ## above the 70% and 90% expression-cutoff ## in the basal ganglia of the adult human brain (Allen:4276) anno2 = get_annotated_genes(structure_ids='Allen:4276', dataset='adult', cutoff_quantiles=c(0.7,0.9), genes=gene_ids) anno2 ## ----------------------------------------------------------------------------------------------------------- ## use previously defined input genes dataframe head(input_hyper) ## ----results='hide'----------------------------------------------------------------------------------------- ## run enrichment analysis with developmental effect score res_dev_effect = aba_enrich(input_hyper, dataset='dev_effect', cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100) ## ----------------------------------------------------------------------------------------------------------- ## see the 5 brain regions with the lowest FWERs top_regions = res_dev_effect[[1]][1:5, ] top_regions ## ---- eval=F------------------------------------------------------------------------------------------------ # ## plot developmental effect score of the 5 brain regions with the lowest FWERs # plot_expression(top_regions[ ,'structure_id']) ## ----------------------------------------------------------------------------------------------------------- sessionInfo()