WGCNA Attempt 1 for ceabigr project exon expression data
Today I played around with WGCNA analyses to look at exon expression data for the ceabigr project.
Overview
We tried using a WGCNA to capture patterns of relative expression across exons. Steven provided data averaged across all individuals that was in the form of a matrix with relative change in expression for each exon (2-6) relative to exon 1.
Here is the code I used today and example outputs.
library(BiocManager)
#BiocManager::install("WGCNA")
library(WGCNA)
#library(kableExtra)
# library(DESeq2)
# library(pheatmap)
# library(RColorBrewer)
# library(data.table)
#library(DT)
# library(Biostrings)
#library(methylKit)
library(WGCNA)
library(data.table) # for data manipulation
library(tidyverse)
knitr::opts_chunk$set(
echo = TRUE, # Display code chunks
eval = TRUE, # Evaluate code chunks
warning = TRUE, # Hide warnings
message = TRUE, # Hide messages
fig.width = 6, # Set plot width in inches
fig.height = 4, # Set plot height in inches
fig.align = "center" # Align plots to the center
)
Read in data
#datExpr <- fread("output/68-female-exon-fold/logfc.txt")
datExpr <- fread("output/68-female-exon-fold/female_merge.txt")
rownames(datExpr) <- datExpr$GeneID
names(datExpr)
genes<-datExpr$GeneID
datExpr <- datExpr[ , -1, with = FALSE]
result_vector <- genes[!grepl("^L", genes, ignore.case = TRUE)]
genes <- genes[genes != "GeneID"]
Transpose to put “samples” or exon in rows and genes in columns. Put gene ID in row names.
datExpr <- t(datExpr)
datExpr<-as.data.frame(datExpr)
names(datExpr)<-genes
str(datExpr)
datExpr[] <- lapply(datExpr, as.numeric)
names(datExpr)<-genes
Choose a soft power.
powers = c(1:10)
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
plot(sft$fitIndices[,1], -log(sft$fitIndices[,3]), pch = 19, xlab="Soft Threshold (power)", ylab="-log10 scale free topology model fit", type="n")
text(sft$fitIndices[,1], -log(sft$fitIndices[,3]), labels=powers, cex=0.5)
abline(h = -log(0.9), col = "red")
Blockwise modules network construction
Run blockwise modules with a signed network.
picked_power = 8
temp_cor <- cor
cor <- WGCNA::cor # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(datExpr, # <= input here
# == Adjacency Function ==
power = picked_power, # <= power here
networkType = "signed",
# == Tree and Block Options ==
deepSplit = 2,
pamRespectsDendro = F,
# detectCutHeight = 0.75,
minModuleSize = 1000,
maxBlockSize = 10000,
# == Module Adjustments ==
mergeCutHeight = 0.05,
reassignThreshold = 1e-6,
minCoreKME = 0.5,
minKMEtoStay = 0.3,
# == TOM == Archive the run results in TOM file (saves time) but it doesn't save a file
saveTOMs = F,
saveTOMFileBase = "ER",
# == Output Options
numericLabels = T,
verbose = 3)
cor <- temp_cor # Return cor function to original namespace
# Identify labels as numbers
mergedColors = netwk$colors
# Plot the dendrogram and the module colors underneath
table(mergedColors)
membership<-as.data.frame(mergedColors)
membership$gene<-rownames(membership)
names(membership)<-c("module", "gene")
Plot module eigengene level
Next plot eigengene levels
#Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = mergedColors, softPower = 8)
MEs = MEList$eigengenes
ncol(MEs) #How many modules do we have now?
table(mergedColors)
table<-as.data.frame(table(mergedColors))
table
The table shows number of genes in each module.
module no. genes
0 713
1 3076
2 2926
3 2471
4 2115
5 2031
6 1854
7 1819
8 1810
9 1774
10 1680
11 1504
12 1474
13 1313
Plot module expression across exon location.
head(MEs)
names(MEs)
Strader_MEs <- MEs
Strader_MEs$exon <- c("2", "3", "4", "5", "6")
head(Strader_MEs)
plot_MEs<-Strader_MEs%>%
gather(., key="Module", value="Mean", ME0:ME13)
Plot module expression across exon location.
library(ggplot2)
library(tidyverse)
expression_plot<-plot_MEs%>%
group_by(Module, exon) %>%
ggplot(aes(x=exon, y=Mean)) +
facet_wrap(~Module)+
geom_hline(yintercept = 0, linetype="dashed", color = "grey")+
geom_point() +
geom_line(group=1)+
#ylim(-0.5,1) +
ylab("Mean Module Eigenegene") +
theme_classic(); expression_plot
ggsave(plot=expression_plot, filename="output/69-wgcna/module-expression.png", width=12, height=12)
Identify modules that have only 1 instance of a gene (if we see only 1 instance, that means a gene moved between modules by treatment)
# Extract the substring before the underscore in the gene column
membership <- membership %>%
mutate(gene_prefix = str_extract(gene, "^[^_]+"))
# Identify duplicate gene prefixes within the same module
unique_within_module <- membership %>%
group_by(module, gene_prefix) %>%
filter(n() == 1) %>%
arrange(module, gene_prefix)
unique_within_module
length(unique(unique_within_module$gene_prefix))
There are 12,223 genes that are in different modules by treatment.
# Identify duplicate gene prefixes within the same module
same_within_module <- membership %>%
group_by(module, gene_prefix) %>%
filter(n() == 2) %>%
arrange(module, gene_prefix)
same_within_module
length(unique(same_within_module$gene_prefix))
There are 1058 genes that are in the same module by treatment.
Number of unique genes in module 0.
test<-membership%>%
filter(module==0)
length(unique(test$gene_prefix))
703 unique genes
Overlay plot of every individual gene expression level over the module mean
In the plot below, adjust “target_module”, and “target_module2” to include the module number you are interested in.
Module 0
target_module<-0
target_genes<-membership%>%filter(module==target_module)%>%select(gene)
target_genes<-unique(target_genes$gene)
# Select columns in datExpr that match any item in target_genes
selected_data <- datExpr[, colnames(datExpr) %in% target_genes, drop = FALSE]
selected_data$exon<-c("2", "3", "4", "5", "6")
selected_data<-selected_data %>%
pivot_longer(cols = -exon,
names_to = "gene",
values_to = "value")%>%
#add color for exposed vs control
mutate(treatment = ifelse(grepl("exp", gene, ignore.case = TRUE), "exposed", "control"))
#add in mean module expression value from module dataframe
target_module2<-"ME0"
selected_module<-plot_MEs%>%
filter(Module==target_module2)
target_plot<-ggplot() +
ggtitle(paste("Module ", target_module))+
geom_hline(yintercept = 0, linetype="dashed", color = "grey")+
geom_line(data=selected_data, aes(x=exon, y=value, group=gene, color=treatment))+
geom_point(data=selected_data, aes(x=exon, y=value, color=treatment)) +
geom_line(data=selected_module, aes(x=exon, y=Mean, group=1), color="black", size=2)+
geom_point(data=selected_module, aes(x=exon, y=Mean), color="black", size=4) +
scale_color_manual(values=c("gray", "red"), name="Treatment")+
#ylim(-5,5) +
ylab("Mean Expression Relative to Exon 1") +
xlab("Exon")+
theme_classic(); target_plot
ggsave(plot=target_plot, filename="output/69-wgcna/genes-module0.png", width=6, height=6)
Module 1
target_module<-1
target_genes<-membership%>%filter(module==target_module)%>%select(gene)
target_genes<-unique(target_genes$gene)
# Select columns in datExpr that match any item in target_genes
selected_data <- datExpr[, colnames(datExpr) %in% target_genes, drop = FALSE]
selected_data$exon<-c("2", "3", "4", "5", "6")
selected_data<-selected_data %>%
pivot_longer(cols = -exon,
names_to = "gene",
values_to = "value")%>%
#add color for exposed vs control
mutate(treatment = ifelse(grepl("exp", gene, ignore.case = TRUE), "exposed", "control"))
#add in mean module expression value from module dataframe
target_module2<-"ME1"
selected_module<-plot_MEs%>%
filter(Module==target_module2)
target_plot<-ggplot() +
ggtitle(paste("Module ", target_module))+
geom_hline(yintercept = 0, linetype="dashed", color = "grey")+
geom_line(data=selected_data, aes(x=exon, y=value, group=gene, color=treatment))+
geom_point(data=selected_data, aes(x=exon, y=value, color=treatment)) +
geom_line(data=selected_module, aes(x=exon, y=Mean, group=1), color="black", size=2)+
geom_point(data=selected_module, aes(x=exon, y=Mean), color="black", size=4) +
scale_color_manual(values=c("gray", "red"), name="Treatment")+
#ylim(-5,5) +
ylab("Mean Expression Relative to Exon 1") +
xlab("Exon")+
theme_classic(); target_plot
ggsave(plot=target_plot, filename="output/69-wgcna/genes-module1.png", width=6, height=6)
You can add any other modules here you are interested in!
View genes that change modules
Extract genes that are in module 0 in control and module NOT 0 when exposed. Look at the values for these genes colored by treatment.
control_module<-0
treatment_module<-1
interest_genes<-membership%>%
mutate(treatment = ifelse(grepl("exp", gene, ignore.case = TRUE), "exposed", "control"))%>%
arrange(gene_prefix)%>%
select(module, gene_prefix, treatment)
rownames(interest_genes)<-NULL
interest_genes<-interest_genes%>%
pivot_wider(names_from=treatment, values_from=module)%>%
filter(control==control_module & exposed==treatment_module)
interest_genes<-interest_genes$gene_prefix
length(interest_genes)
#select columns that contain the gene prefix
selected_columns <- datExpr %>%
select(matches(paste(interest_genes, collapse = "|")))
length(selected_columns)
#should be double the length(interest_genes) above
selected_columns$exon<-c("2", "3", "4", "5", "6")
selected_columns<-selected_columns %>%
pivot_longer(cols = -exon,
names_to = "gene",
values_to = "value")%>%
#add color for exposed vs control
mutate(treatment = ifelse(grepl("exp", gene, ignore.case = TRUE), "exposed", "control"))
#plot expression patterns by treatment
target_plot<-ggplot(data=selected_columns, aes(color=treatment, x=exon, y=value)) +
ggtitle(paste("Module", control_module, "to Module ", treatment_module))+
geom_hline(yintercept = 0, linetype="dashed", color = "grey")+
geom_line(aes(group=gene))+
geom_point() +
scale_color_manual(values=c("gray", "red"), name="Treatment")+
#ylim(-5,5) +
ylab("Mean Expression Relative to Exon 1") +
xlab("Exon")+
theme_classic(); target_plot
ggsave(plot=target_plot, filename="output/69-wgcna/genes-module0-module1-treatment.png", width=6, height=6)
Next read in and plot sample-level data for these genes rather than averaged data.