## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)

## ----installation, eval=FALSE-------------------------------------------------
#  if(!requireNamespace('BiocManager', quietly = TRUE))
#    install.packages('BiocManager')
#  
#  BiocManager::install("HybridExpress")

## ----load_package, message=FALSE----------------------------------------------
# Load package after installation
library(HybridExpress)

set.seed(123) # for reproducibility

## ----message = FALSE----------------------------------------------------------
library(SummarizedExperiment)

# Load data
data(se_chlamy)

# Inspect the `SummarizedExperiment` object
se_chlamy

## Take a look at the colData and count matrix
colData(se_chlamy)
assay(se_chlamy) |> head()

table(se_chlamy$Ploidy, se_chlamy$Generation)

## -----------------------------------------------------------------------------
# Add midparent expression using the mean of the counts
se <- add_midparent_expression(se_chlamy)
head(assay(se))

# Alternative 1: using the sum of the counts
add_midparent_expression(se_chlamy, method = "sum") |>
    assay() |> 
    head()

# Alternative 2: using the weighted mean of the counts (weights = ploidy)
w <- c(2, 1) # P1 = diploid; P2 = haploid
add_midparent_expression(se_chlamy, method = "weightedmean", weights = w) |>
    assay() |> 
    head()

## -----------------------------------------------------------------------------
# Show last rows of the count matrix
assay(se) |> 
    tail()

## -----------------------------------------------------------------------------
# Take a look at the original colData
colData(se)

# Add size factors estimated from spike-in controls
se <- add_size_factors(se, spikein = TRUE, spikein_pattern = "ERCC")

# Take a look at the new colData
colData(se)

## -----------------------------------------------------------------------------
# For colData rows with missing values (midparent samples), add "midparent"
se$Ploidy[is.na(se$Ploidy)] <- "midparent"
se$Generation[is.na(se$Generation)] <- "midparent"

# Create PCA plot
pca_plot(se, color_by = "Generation", shape_by = "Ploidy", add_mean = TRUE)

## -----------------------------------------------------------------------------
# Plot a heatmap of sample correlations
plot_samplecor(se, coldata_cols = c("Generation", "Ploidy"))

## -----------------------------------------------------------------------------
# Get a list of differentially expressed genes for each contrast
deg_list <- get_deg_list(se)

# Inspecting the output
## Getting contrast names
names(deg_list)

## Accessing gene-wise test statistics for contrast `F1_vs_P1`
head(deg_list$F1_vs_P1)

## Counting the number of DEGs per contrast
sapply(deg_list, nrow)

## -----------------------------------------------------------------------------
# Get a data frame with DEG frequencies for each contrast
deg_counts <- get_deg_counts(deg_list)

deg_counts

## -----------------------------------------------------------------------------
attributes(deg_list)$ngenes

## -----------------------------------------------------------------------------
# Total number of genes in the C. reinhardtii genome (v6.1): 16883
attributes(deg_list)$ngenes <- 16883

## -----------------------------------------------------------------------------
deg_counts <- get_deg_counts(deg_list)
deg_counts

## -----------------------------------------------------------------------------
# Plot expression triangle
plot_expression_triangle(deg_counts)

## ----fig.height=5-------------------------------------------------------------
# Create vectors (length 4) of colors and box labels
pal <- c("springgreen4", "darkorange3", "mediumpurple4", "mediumpurple3")
labels <- c("Parent 1\n(2n)", "Parent 2\n(n)", "Progeny\n(3n)", "Midparent")

plot_expression_triangle(deg_counts, palette = pal, box_labels = labels)

## -----------------------------------------------------------------------------
# Classify genes in expression partitions
exp_partitions <- expression_partitioning(deg_list)

# Inspect the output
head(exp_partitions)

# Count number of genes per category
table(exp_partitions$Category)

# Count number of genes per class
table(exp_partitions$Class)

## ----fig.height=6, fig.width=8------------------------------------------------
# Plot partitions as a scatter plot of divergences
plot_expression_partitions(exp_partitions, group_by = "Category")

## ----fig.height=7, fig.width=8------------------------------------------------
# Group by `Class`
plot_expression_partitions(exp_partitions, group_by = "Class")

## ----fig.height=7-------------------------------------------------------------
# Visualize frequency of genes in each partition
## By `Category` (default)
plot_partition_frequencies(exp_partitions)

## By `Class`
plot_partition_frequencies(exp_partitions, group_by = "Class")

## -----------------------------------------------------------------------------
# Load example functional annotation (GO terms)
data(go_chlamy)

head(go_chlamy)

## -----------------------------------------------------------------------------
# Get a vector of genes in class "ADD"
genes_add <- exp_partitions$Gene[exp_partitions$Class == "ADD"]
head(genes_add)

# Get background genes - genes in the count matrix
bg <- rownames(se)

# Perform overrepresentation analysis
ora_add <- ora(genes_add, go_chlamy, background = bg)

# Inspect results
head(ora_add)

## -----------------------------------------------------------------------------
# Create a list of genes in each class
genes_by_class <- split(exp_partitions$Gene, exp_partitions$Class)

names(genes_by_class)
head(genes_by_class$UP)

# Iterate through each class and perform ORA
ora_classes <- lapply(
    genes_by_class, # list through which we will iterate
    ora, # function we will apply to each element of the list
    annotation = go_chlamy, background = bg # additional arguments to function
)

# Inspect output
ora_classes

## -----------------------------------------------------------------------------
# Get up-regulated genes for each contrast
up_genes <- lapply(deg_list, function(x) rownames(x[x$log2FoldChange > 0, ]))
names(up_genes)
head(up_genes$F1_vs_P1)

# Perform ORA
ora_up <- lapply(
    up_genes,
    ora,
    annotation = go_chlamy, background = bg
)

ora_up

## -----------------------------------------------------------------------------
# Get count matrix
count_matrix <- assay(se_chlamy)
head(count_matrix)

# Get colData (data frame of sample metadata)
coldata <- colData(se_chlamy)
head(coldata)

## -----------------------------------------------------------------------------
# Create a SummarizedExperiment object
new_se <- SummarizedExperiment(
    assays = list(counts = count_matrix),
    colData = coldata
)

new_se

## ----echo = FALSE-------------------------------------------------------------
sessioninfo::session_info()