WGCNA Attempt 2 for ceabigr project exon expression data

Today I played around with another attempt at WGCNA analyses to look at exon expression data for the ceabigr project.

This time, we are trying data formatted with sample_foldchange in rows and gene in columns.

See my previous post here for more explanation.

Coding and results

#library(kableExtra)
# library(DESeq2)
# library(pheatmap)
# library(RColorBrewer)
# library(data.table)
#library(DT)
# library(Biostrings)
#library(methylKit)
library(WGCNA)
library(data.table) # for data manipulation
library(tidyverse)

knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = TRUE,     # Hide warnings
  message = TRUE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)

Read in data

Read in data that now has sample_fold in rows and genes in columns with values representing fold change from the first exon relative to each subsequent exon.

Running for just females for now.

#setwd("Projects/ceabigr")

#datExpr <- fread("output/68-female-exon-fold/logfc.txt")
datExpr <- fread("output/72-exon-data-rfmt/female_exon_tf.csv")

Set row names and remove character column.

rownames(datExpr) <- datExpr$SampleID_fold
sample_vector<-datExpr$SampleID_fold
datExpr <- datExpr[ , -1, with = FALSE]

Remove rows that are a sum of 0. These are the fold 1 rows.

# Find rows with sum not equal to 0
nonZeroSumRows <- rowSums(datExpr, na.rm=TRUE) != 0

# Subset the matrix using logical indexing
datExpr <- datExpr[nonZeroSumRows, ]

Remove columns that have na’s (removed genes not present in every sample).

datExpr<-datExpr %>%
    select_if(~ !any(is.na(.)))

We had 13,280 genes before, and now have 11,270 genes.

Choose a soft power.

powers = c(1:35)
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
plot(sft$fitIndices[,1], sft$fitIndices[,3], pch = 19, xlab="Soft Threshold (power)", ylab="scale free topology model fit", type="n")
text(sft$fitIndices[,1], sft$fitIndices[,3], labels=powers, cex=0.5)
abline(h = 0.9, col = "red")

It’s having a problem estimating variance to generate the soft powers… It doesn’t reach the 0.9 threshold. We will have to return to this… Unsure what it means.

Blockwise modules network construction

Run blockwise modules with a signed network.

picked_power =  10
temp_cor <- cor       
cor <- WGCNA::cor                                             # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(datExpr,                         # <= input here

                          # == Adjacency Function ==
                          power = picked_power,               # <= power here
                          networkType = "signed",

                          # == Tree and Block Options ==
                          deepSplit = 2,
                          pamRespectsDendro = F,
                          # detectCutHeight = 0.75,
                          minModuleSize = 1000,                  
                          maxBlockSize = 10000,

                          # == Module Adjustments ==
                          mergeCutHeight = 0.05,
                          reassignThreshold = 1e-6,
                          minCoreKME = 0.5,
                          minKMEtoStay = 0.3,

                          # == TOM == Archive the run results in TOM file (saves time) but it doesn't save a file
                          saveTOMs = F,
                          saveTOMFileBase = "ER",

                          # == Output Options
                          numericLabels = T,
                          verbose = 3)

cor <- temp_cor     # Return cor function to original namespace

# Identify labels as numbers 
mergedColors = netwk$colors
# Plot the dendrogram and the module colors underneath

table(mergedColors)

membership<-as.data.frame(mergedColors)

membership$gene<-rownames(membership)

names(membership)<-c("module", "gene")

Plot module eigengene level

Next plot eigengene levels

#Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = mergedColors, softPower = 8)
MEs = MEList$eigengenes

ncol(MEs) #How many modules do we have now?

table(mergedColors)
table<-as.data.frame(table(mergedColors))
table

The table shows number of genes in each module.

mergedColors Freq 1 0 5520 2 1 3511 3 2 2239

There are three modules.

Plot module expression across exon location.

head(MEs)
names(MEs)
Strader_MEs <- MEs
#Strader_MEs$exon <- c("2", "3", "4", "5", "6")
cleaned_vector <- sample_vector[!grepl("fold1", sample_vector)]
Strader_MEs$sample <- cleaned_vector
head(Strader_MEs)

Strader_MEs <- separate(Strader_MEs, col = sample, into = c("sample", "fold"), sep = "_", remove = FALSE)
head(Strader_MEs)

plot_MEs<-Strader_MEs%>%
  gather(., key="Module", value="Mean", ME0:ME2)

First, assign treatment by sample ID.

meta<-read_csv("data/adult-meta.csv")

plot_MEs$treatment<-meta$Treatment[match(plot_MEs$sample, meta$OldSample.ID)]

Plot module expression across exon location.

library(ggplot2)
library(tidyverse)

expression_plot<-plot_MEs%>%
  group_by(Module, fold) %>%
  
  ggplot(aes(x=fold, y=Mean, color=treatment)) +
  facet_wrap(~Module)+
  geom_hline(yintercept = 0, linetype="dashed", color = "grey")+
  geom_point() +
  scale_color_manual(values=c("gray", "darkred"))+
  #geom_line(group=1)+
  #ylim(-0.5,1) +
  ylab("Mean Module Eigenegene") +
  xlab("Exon fold change")+
  theme_classic()+
  theme(panel.border = element_rect(fill=NA)); expression_plot

ggsave(plot=expression_plot, filename="output/69-wgcna/module-expression.png", width=10, height=6)

It looks like Module 0 is genes that peak in the middle (exon 3-4); Module 1 are genes that have higher expression of earlier exons (exon 2-3) and Module 2 are genes that increase with higher expression of later exons (exons 5-6).

Next steps

I will next need to figure out which genes change module assignment by treatment. Currently, I don’t have a way to do this because each gene is assigned to one module. We do not have separate columns for the gene value from control and that same gene from treatment as we did last time.

Written on February 27, 2024