NB: the 4th SubSeries of data is ommited because it features longitudinal data
Please install the following packages that will be useful during this practical:
dplyr
flashClust
edgeR
limma
UpSetR
WARNING: some of those are from Bioconductor
library(dplyr)
library(edgeR)
library(limma)
library(UpSetR)
library(flashClust)
This practicals will be based on the public data provided alongside the (Singhania et al. 2018) article on the Gene Expression Omnibus website data repository under the superserie accession number GSE107995
and that studied the transcriptional signature of active human tuberculosis (TB) infection.
0.a) first download both the raw and
edgeR
-normalizedxlsx
files from GEO for all 3 subseries:
GSE107991
GSE107992
GSE107993
0.b) read through the code below and try to understand the operation performed. In particular, have a look at the
GSE107991_metadata
object (tip: it is aS4 object
, so use the@
operator)
# DATA ----
rm(list = ls())
library(GEOquery)
library(readxl)
London_norm <- read_excel("GSE107991_edgeR_normalized_Berry_London.xlsx") # edgeR preprocessed
SouthAfrica_norm <- read_excel("GSE107992_edgeR_normalized_Berry_SouthAfrica.xlsx")
Leicester_norm <- read_excel("GSE107993_edgeR_normalized_Leicester_non_progressor_longitudnal_only.xlsx") # raw counts
London_raw <- read_excel("GSE107991_Raw_counts_Berry_London.xlsx") # raw counts
SouthAfrica_raw <- read_excel("GSE107992_Raw_counts_Berry_SouthAfrica.xlsx") # raw counts
Leicester_raw <- read_excel("GSE107993_Raw_counts_Leicester_non_progressor_longitudnal_only.xlsx") # raw counts
# metadata
GSE107991_metadata <- GEOquery::getGEO("GSE107991", GSEMatrix = FALSE) #London
GSE107992_metadata <- GEOquery::getGEO("GSE107992", GSEMatrix = FALSE) # SouthAfrica
GSE107993_metadata <- GEOquery::getGEO("GSE107993", GSEMatrix = FALSE) #Leicester
get_info <- function(i, meta) {
name <- meta@gsms[[i]]@header$source_name_ch1
name <- gsub("Active_TB", "ActiveTB", name)
name <- gsub("Test_set", "TestSet", name)
name <- gsub("Validation_set", "ValidationSet", name)
name <- gsub("Non_progressor", "NonProgressor", name)
res <- unlist(strsplit(name, split = "_"))
res <- c(res, meta@gsms[[i]]@header$title)
res
}
infos_london <- sapply(1:length(GSE107991_metadata@gsms), FUN = get_info, meta = GSE107991_metadata)
infos_df_london <- cbind.data.frame(GSMID = names(GSE107991_metadata@gsms),
t(infos_london))
rownames(infos_df_london) <- names(GSE107991_metadata@gsms)
colnames(infos_df_london)[-1] <- c("Cohort", "Location", "Set", "Status", "SampleName")
infos_sf <- sapply(1:length(GSE107992_metadata@gsms), FUN = get_info, meta = GSE107992_metadata)
infos_df_sf <- cbind.data.frame(GSMID = names(GSE107992_metadata@gsms), t(infos_sf))
rownames(infos_df_sf) <- names(GSE107992_metadata@gsms)
colnames(infos_df_sf)[-1] <- c("Cohort", "Location", "Set", "Status", "SampleName")
infos_leicester <- sapply(1:length(GSE107993_metadata@gsms), FUN = get_info,
meta = GSE107993_metadata)
infos_df_leicester <- cbind.data.frame(GSMID = names(GSE107993_metadata@gsms),
t(infos_leicester))
rownames(infos_df_leicester) <- names(GSE107993_metadata@gsms)
colnames(infos_df_leicester)[-1] <- c("Cohort", "Location", "Status", "Set",
"SampleName")
infos_df_leicester <- infos_df_leicester[, c(1, 2, 3, 5, 4, 6)]
infos_singhania <- rbind.data.frame(infos_df_london, infos_df_sf, infos_df_leicester)
expr_singhania_raw <- merge(x = merge(x = London_raw, y = SouthAfrica_raw, by = c(1:3)),
y = Leicester_raw, by = c(1:3))
expr_singhania_norm <- merge(x = merge(x = London_norm, y = SouthAfrica_norm,
by = c(1:3)), y = Leicester_norm, by = c(1:3))
save(infos_singhania, expr_singhania_raw, expr_singhania_norm, London_norm,
SouthAfrica_norm, Leicester_norm, file = "SinghaniaTB_data.Rdata")
Now let’s start from the data set expr_singhania_raw
and replicate the normalization steps that were performed in (Singhania et al. 2018).
The counts matrix outputted from the alignement usually contains many genes that are never observed.
1.a) remove the genes that are never observed, i.e. with 0 counts in every samples using the
dplyr::select()
,rowSums()
anddplyr::filter
() functions.
dim(expr_singhania_raw)
## [1] 58051 242
# removing genes never observed
gene_no0 <- rowSums(dplyr::select(expr_singhania_raw, -dplyr::starts_with("Gene"))) >
0
table(gene_no0)
## gene_no0
## FALSE TRUE
## 6443 51608
expr_singhania_raw_no0 <- dplyr::filter(expr_singhania_raw, gene_no0)
dim(expr_singhania_raw_no0)
## [1] 51608 242
Low counts are still badly understood by the community and their measure is less reliable than higher counts, therefore it is common to remove genes with very low counts. In particular, Singhania et al considered only genes “expressed with counts per million (CPM) > 2 in at least five samples” (such filters can seem a bit ad hoc).
1.b) Compute the counts per million and keep only the genes that have a cpm>2 in at least 5 samples.
NB: \(cpm(x_{ig}) = \dfrac{x_{ig} \times 1,000,000}{\sum_{g=1}^Gx_{ig}}\) with \(x_g\) the observed raw count for the gene \(g\) in sample \(i\).
mycpmfun <- function(x) {
return((x/sum(x)) * 10^6)
}
cpm <- apply(dplyr::select(expr_singhania_raw_no0, -starts_with("Gene")), MARGIN = 2,
FUN = mycpmfun)
expr_singhania_raw_no0_cpm2p <- dplyr::filter(expr_singhania_raw_no0, rowSums(cpm >
2) > 5)
dim(expr_singhania_raw_no0_cpm2p)
## [1] 15190 242
1.c) Finally keep only the protein coding genes (i.e. for which
Gene_biotype
isprotein_coding
)
gene_pc <- expr_singhania_raw_no0_cpm2p$Gene_biotype == "protein_coding"
table(gene_pc)
## gene_pc
## FALSE TRUE
## 2763 12427
expr_singhania_raw_no0_cpm2p_prot <- filter(expr_singhania_raw_no0_cpm2p, gene_pc)
dim(expr_singhania_raw_no0_cpm2p_prot)
## [1] 12427 242
2.a) Display the boxplots of each samples \(log2\) raw counts (with the
log2()
function) alongside each other, as well as a barpot of the library sizes for each sample, colored by cohort location (tip: use “baseR
” graphics). Comment the need for normalization.
bp_col <- infos_singhania$Location
levels(bp_col) <- c("blue2", "gold", "green2")
boxplot(log2(select(expr_singhania_raw_no0_cpm2p_prot, -starts_with("Gene"))),
col = as.character(bp_col), xlab = "Samples", ylab = "log2(raw counts)",
axes = FALSE)
axis(2)
box()
legend("topright", c("London", "South Africa", "Leicester"), col = c("blue3",
"gold", "green2"), pch = 15, horiz = TRUE, bg = NA)
barplot(colSums(select(expr_singhania_raw_no0_cpm2p_prot, -starts_with("Gene"))),
col = as.character(bp_col), xlab = "Samples", ylab = "Library size", names.arg = "")
legend("topright", c("London", "South Africa", "Leicester"), col = c("blue3",
"gold", "green2"), pch = 15, horiz = FALSE, bg = NA)
The sequencing depth problem is commonly treated with the TMM normalization. This normalization divides the raw count by the library size.
2.b) use the
edgeR::calcNormFactors()
function to compute the TMM normalization after converting the filtered count matrix to aDGEList
object
singhaniaDGE <- DGEList(counts = select(expr_singhania_raw_no0_cpm2p_prot, -starts_with("Gene")),
samples = infos_singhania, genes = select(expr_singhania_raw_no0_cpm2p_prot,
starts_with("Gene")))
singhaniaDGE_normfact <- edgeR::calcNormFactors(singhaniaDGE, method = "TMM")
2.c) use the
edgeR::cpm()
function to return the filtered, TMM normalized, log2-Counts per million.
expr_singhania_no0_cpm2p_prot_TMMlog2cpm <- edgeR::cpm(singhaniaDGE_normfact,
log = TRUE)
2.d) Compare the obtained values to the one shared publicly.
dim(expr_singhania_no0_cpm2p_prot_TMMlog2cpm)
## [1] 12427 239
dim(expr_singhania_norm)
## [1] 13441 242
expr_singhania_no0_cpm2p_prot_TMMlog2cpm[1:5, 1:4]
Berry_London_Sample1 | Berry_London_Sample2 | Berry_London_Sample3 | Berry_London_Sample4 |
---|---|---|---|
4.650241 | 4.164666 | 3.882906 | 4.606822 |
4.687757 | 4.603745 | 4.352406 | 4.709899 |
2.396327 | 2.096244 | 2.238437 | 1.914652 |
10.272096 | 9.998178 | 10.107998 | 9.845746 |
1.951596 | 1.866684 | 2.173530 | 1.558944 |
expr_singhania_norm[1:5, 4:7]
Berry_London_Sample1 | Berry_London_Sample2 | Berry_London_Sample3 | Berry_London_Sample4 |
---|---|---|---|
4.647009 | 4.193344 | 3.887871 | 4.603721 |
4.684619 | 4.633853 | 4.358676 | 4.707055 |
2.379097 | 2.107392 | 2.233367 | 1.891003 |
10.272479 | 10.032204 | 10.117594 | 9.846276 |
1.927926 | 1.873832 | 2.167779 | 1.528433 |
2.e) Display the boxplots of each samples \(log2\)-cpm normalized expressions alongside each other and comment.
bp_col <- infos_singhania$Location
levels(bp_col) <- c("blue2", "gold", "green2")
boxplot(expr_singhania_no0_cpm2p_prot_TMMlog2cpm, col = as.character(bp_col),
xlab = "Samples", ylab = "Normalized log2 cpm", axes = FALSE)
axis(2)
box()
legend("topright", c("London", "South Africa", "Leicester"), col = c("blue3",
"gold", "green2"), pch = 15, horiz = TRUE, bg = NA)
2.f) Display the dendrogram of the hierarchical clusterings using Wards method and the Euclidean distance comparing the raw counts and the normalized \(log2\)-cpm.
deucl_raw <- dist(t(select(expr_singhania_raw_no0_cpm2p_prot, -starts_with("Gene"))))
deucl_norm <- dist(t(expr_singhania_no0_cpm2p_prot_TMMlog2cpm))
plot(flashClust::hclust(deucl_raw, method = "ward"), labels = infos_singhania$Location)
plot(flashClust::hclust(deucl_norm, method = "ward"), labels = infos_singhania$Location)
Let’s now focus on the first analysis performed by Singhania et al., i.e. the analysis of the London Berry cohort alone, pooling together Latent TB
and Controls
to be compared to the Active TB
individuals.
3.a) Formulate the research question of interest here.
Which genes are significantly differnetially expressed bewteen the Active TB patients on the one hand, and the other patients (LTBI and Controls pooled together) in the London test dataset ?
3.b) With the function
model.matrix()
generate a model matrix for the (generalized) linear model that you will use to answer that question.
singhaniaLondonDGE_normfact <- singhaniaDGE_normfact[, singhaniaDGE_normfact$samples$Location ==
"London"]
design <- as.data.frame(model.matrix(~Status, data = infos_singhania[infos_singhania$Location ==
"London", ]))
design <- mutate(design, ActiveTB = 1 - (StatusLTBI + StatusControl))
design <- select(design, -starts_with("Status"))
edgeR
pipeline(Robinson, McCarthy, and Smyth 2010)
4.a) Using
edgeR::estimateCommonDisp()
add a (common) dispertion estimate to the DGE object
singhaniaLondonDGE_normfact_commondisp <- edgeR::estimateCommonDisp(singhaniaLondonDGE_normfact)
4.b) Using
edgeR::glmFit()
andedgeR::glmLRT()
functions, identify the differentially expressed genes (adjusted p-value < 5% and | log-FoldChange | > 1 )
fit <- glmFit(singhaniaLondonDGE_normfact_commondisp, design)
lrt <- glmLRT(fit, coef = 2)
signif_edgeR <- decideTestsDGE(lrt, adjust.method = "BH", p.value = 0.05, lfc = 1)
summary(signif_edgeR)
## ActiveTB
## Down 49
## NotSig 12168
## Up 210
genelist_edgeR <- singhaniaLondonDGE_normfact_commondisp$genes$Gene_name[as.logical(signif_edgeR)]
voom
-limma
pipeline(Smyth et al. 2002)
5.a) compute the precision weights with the
limma::voom()
function.
# estimate the heteroscedasticity weights
v <- voom(singhaniaLondonDGE_normfact, design, plot = TRUE)
5.b) now apply the
limma
pipeline with the functionslimma::lmFit()
,limma::eBayes()
andlimma::decideTests()
.
# fit the gaussian linear model with precision weights v
fit <- lmFit(v, design)
# Empirical Bayes test
fit <- eBayes(fit, robust = TRUE)
voomlimma_signif <- decideTests(fit, adjust.method = "BH", p.value = 0.05, lfc = 1)
summary(voomlimma_signif)
## (Intercept) ActiveTB
## Down 182 72
## NotSig 1124 12176
## Up 11121 179
genelist_voomlimma <- singhaniaLondonDGE_normfact_commondisp$genes$Gene_name[as.logical(voomlimma_signif)]
6.a) Using an Up-Set diagramm (Conway, Lex, and Gehlenborg 2017) with the
UpSetR
package, compare the results betweenedgeR
andvoom
-limma
upset_data <- cbind.data.frame(voomlimma = abs(voomlimma_signif[, 2]), edgeR = abs(signif_edgeR[,
1]))
upset(data = upset_data, main.bar.color = c("dodgerblue3", "red3", "purple4"),
sets.bar.color = c("red3", "dodgerblue3"))
6.b) Compare to the results from the Singhania et al. original article (in particular to the 373 gene signature). Can you list the elements that differed in their analysis compared to what you just did here ?
Conway, Jake R, Alexander Lex, and Nils Gehlenborg. 2017. “UpSetR: An R Package for the Visualization of Intersecting Sets and Their Properties.” Bioinformatics 33 (18): 2938–40.
Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010. “EdgeR: A Bioconductor Package for Differential Expression Analysis of Digital Gene Expression Data.” Bioinformatics 26 (1): 139–40.
Singhania, Akul, Raman Verma, Christine M Graham, Jo Lee, Trang Tran, Matthew Richardson, Patrick Lecine, et al. 2018. “A Modular Transcriptional Signature Identifies Phenotypic Heterogeneity of Human Tuberculosis Infection.” Nature Communications 9 (1): 2308.
Smyth, Gordon K, Matthew Ritchie, Natalie Thorne, James Wettenhall, Wei Shi, and Yifang Hu. 2002. “Limma: Linear Models for Microarray and Rna-Seq Data User’s Guide.”