Sequence depth subsampling test with Mcap2020 ITS2 dataset

This post details test for subsampling thresholds in the Montipora capitata early life history timeseries dataset. The purpose of these tests is to determine the minimum read depth to detect differences between treatment groups for future sequencing runs.

Prepare data

Load metadata.

#create metadata and load
Metadata<-read.csv("Mcap2020/Data/ITS2/ITS2_Metadata.csv")
rownames(Metadata) <- Metadata$sample_name
Metadata <- Metadata %>%
  select(code, hpf, group, lifestage)
Metadata <- as.matrix(Metadata)
Metadata <- sample_data(data.frame(Metadata))

Load profile level data.

#load data for taxonomy
tax0 <- read_tsv(
  file  = "Mcap2020/Data/ITS2/mcap.profiles.absolute.abund_and_meta.txt",
  n_max = 6) %>%
  dplyr::select(-2) %>% 
  gather(UID, value, -1) %>% 
  spread(1, value) %>%
  clean_names()

tax1 <- as.matrix(tax0[, -1], dimnames = list(tax0$uid, colnames(tax0[-1])))
rownames(tax1) <- tax0$uid
tax <- tax_table(tax1)

#load data for count matrix 
otu0 <- read_tsv(
  file  = "Mcap2020/Data/ITS2/mcap.profiles.absolute.abund_and_meta.txt", col_names = TRUE) 

list<-c("UID", "sample_name", "8", "9")
colnames(otu0)<-list

otu0<-otu0%>%
  select(-1) %>%
  dplyr::slice(7:n()) %>%
  mutate_at(2:ncol(.), as.numeric)

otu1 <- as.matrix(otu0[, -1])
rownames(otu1) <- otu0$sample_name
otu <- otu_table(otu1, taxa_are_rows = FALSE)

#combine to phyloseq object
coral <- phyloseq(otu, tax, Metadata)
coral

Note that taxa 8 is Clade C and taxa 9 is Clade D.

Next load the data for coral strain level information.

#gather the taxonomy names  
taxnames <- read_tsv(
  file  = "Mcap2020/Data/ITS2/mcap.seqs.absolute.abund_only.txt",
  n_max = 0) %>%
  select(-1) %>%
  names(.)

#extract clade letter  
tax0 <- data_frame(
  DIV = taxnames,
  clade = str_extract(DIV, "[A-Z]")
)

tax1 <- as.matrix(tax0)
rownames(tax1) <- tax0$DIV
tax <- tax_table(tax1)

otu0 <- read_tsv(
  file  = "Mcap2020/Data/ITS2/mcap.seqs.absolute.abund_and_meta.txt") %>%
  select(-1, )

otu1 <- as.matrix(otu0[, 37:113])
rownames(otu1) <- otu0$sample_name
otu <- otu_table(otu1, taxa_are_rows = FALSE)

coralDIV <- phyloseq(otu, tax, Metadata)
coralDIV

Now save the object.

save(coral, coralDIV, file = "Mcap2020/Data/ITS2/ITS2_phyloseq.RData")

Build DIV Data

Load the data.

load("Mcap2020/Data/ITS2/ITS2_phyloseq.RData")
coralDIV

View the number of sequence counts in our dataset.

Samps<-read_tsv(file  = "Mcap2020/Data/ITS2/mcap.seqs.absolute.abund_and_meta.txt")

hist(Samps$post_taxa_id_absolute_symbiodiniaceae_seqs, breaks=30)

Samps$post_taxa_id_absolute_symbiodiniaceae_seqs

We have >10,551 sequences for all of our samples.

Generate tree.

random_tree = rtree(ntaxa(coralDIV), rooted=TRUE, tip.label=taxa_names(coralDIV))

Merge phyloseq data.

phylo_coral_DIV = merge_phyloseq(coralDIV, Metadata, random_tree)
phylo_coral_DIV

Filter out samples with low numbers in groups (n=1 per group) from phyloseq object.

phylo_coral_DIV<-ps_filter(phylo_coral_DIV, lifestage != "Metamorphosed Polyp (183 hpf)")
phylo_coral_DIV<-ps_arrange(phylo_coral_DIV, lifestage)

Reorder lifestage.

levels(as.factor(sample_data(phylo_coral_DIV)$lifestage))

sample_data(phylo_coral_DIV)$lifestage = factor(sample_data(phylo_coral_DIV)$lifestage, levels = c("Egg (1 hpf)","Embryo (5 hpf)","Embryo (38 hpf)","Embryo (65 hpf)", "Larvae (93 hpf)", "Larvae (163 hpf)", "Larvae (231 hpf)", "Metamorphosed Polyp (231 hpf)", "Attached Recruit (183 hpf)", "Attached Recruit (231 hpf)"))

levels(as.factor(sample_data(phylo_coral_DIV)$lifestage))

Assign object to a shorter name.

GP1 = phylo_coral_DIV
GP1

Display sample sequencing depth.

min(sample_sums(GP1))

We will attempt subsampling at the minimum count (10,551 sequences) and in steps decreasing from 10551, 8000, 6000, 4000, 2000, 1000, 500.

DIV level analysis

Does multivariate community composition change across development?

Subsample at 10551

Subsample to even sampling depth to minimum count in dataset (10,551).

GP1_rare<-rarefy_even_depth(GP1, sample.size = min(sample_sums(GP1)),
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 10,551 sequences. No OTU’s removed.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2      F Pr(>F)    
lifestage  9 0.089599 0.48785 2.9635  0.001 ***
Residual  28 0.094062 0.51215                  
Total     37 0.183662 1.00000 

There is a significant difference between lifestages as expected.

Subsample at 8000

Subsample to even sampling depth of 8000.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 8000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

10 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 8000 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2      F Pr(>F)    
lifestage  9 0.092260 0.49173 3.0099  0.001 ***
Residual  28 0.095362 0.50827                  
Total     37 0.187622 1.00000 

Results are the same as 10551 subsampling.

Subsample at 6000

Subsample to even sampling depth of 6000.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 6000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

10 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 6000 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs     R2      F Pr(>F)    
lifestage  9 0.091519 0.4901 2.9903  0.001 ***
Residual  28 0.095217 0.5099                  
Total     37 0.186736 1.0000  

Results are the same as 10551 subsampling.

Subsample at 4000

Subsample to even sampling depth of 4000.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 4000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

20 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 4000 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2      F Pr(>F)    
lifestage  9 0.092875 0.45599 2.6077  0.001 ***
Residual  28 0.110804 0.54401                  
Total     37 0.203679 1.00000  

Results are the same as 10551 subsampling. We are losing a few more rare taxa with subsampling.

Subsample at 2000

Subsample to even sampling depth of 2000.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 2000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

10 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 2000 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2      F Pr(>F)    
lifestage  9  0.11029 0.43532 2.3984  0.001 ***
Residual  28  0.14306 0.56468                  
Total     37  0.25336 1.00000  

Results are the same as 10551 subsampling. Note F statistics are decreasing and we are losing rare DIV’s.

Subsample at 1000

Subsample to even sampling depth of 1000.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 1000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

20 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 1000 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2      F Pr(>F)    
lifestage  9  0.13350 0.39988 2.0731  0.001 ***
Residual  28  0.20034 0.60012                  
Total     37  0.33384 1.00000   

Results are the same as 10551 subsampling. Note F statistics are decreasing.

Subsample at 500

Subsample to even sampling depth of 500.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 500,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

80 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 500 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs      R2     F Pr(>F)    
lifestage  9  0.18356 0.35294 1.697  0.001 ***
Residual  28  0.33652 0.64706                 
Total     37  0.52008 1.00000  

Results are the same as 10551 subsampling. Note F statistics are decreasing and we lost more OTUs this time.

Subsample at 100

Subsample to even sampling depth of 100.

GP1_rare<-rarefy_even_depth(GP1, sample.size = 100,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

140 OTUs were removed.

View sample sequencing depth again after subsampling.

sample_sums(GP1_rare)

All samples are now at 100 sequences.

Now transform counts to relative abundance expressed as a proportion.

GP1_rare = transform_sample_counts(GP1_rare, function(x) x/sum(x))
metadata <- as(sample_data(GP1_rare), "data.frame")

perm1<-adonis2(phyloseq::distance(GP1_rare, method="bray") ~ lifestage,
       data = metadata)

perm1
adonis2(formula = phyloseq::distance(GP1_rare, method = "bray") ~ lifestage, data = metadata)
          Df SumOfSqs     R2    F Pr(>F)  
lifestage  9  0.41772 0.2931 1.29   0.03 *
Residual  28  1.00744 0.7069              
Total     37  1.42516 1.0000    

Results show lower P values than from 10551 subsampling. We lost many OTUs. Of course, sampling at this level would be way too low.

Conclusions

We saw that subsampling between 4000-8000 sequences per sample resulted in the same result as the highest subsampling (10,551) that symbiont community is significantly affected by lifestage (P=0.001). For subsampling between 4000-8000, only 10-20 OTUs are lost from the data set, whereas none are lost in the 10551 subsampling setting. This is due to rare taxa being lost. Once subsampling is below 1000, the number of OTUs lost increases to over 100 lost and P-values increase (to P=0.03).

Based on these results, to be able to confidently detect changes in community across development, we need a minimum sequencing depth of 4000-6000. We will next see if this threshold also applies to analyses at the profile level to accurately detect abundance of specific profiles. The highest read depth possible should be used.

These lower read depths would work for this application because the community is dominated by two profiles and is relatively stable. If trying to detect rare taxa or are looking for DIVs exclusive to treatment/species groupings, higher read depth should be used.

Profile level analysis

What is the relative abundance of major profiles?

Prepare data

Create tree.

random_tree_profile = rtree(ntaxa(coral), rooted=TRUE, tip.label=taxa_names(coral))

Merge phyloseq data.

phylo_coral = merge_phyloseq(coral, Metadata, random_tree_profile)
phylo_coral

Filter out samples with low numbers in groups from phyloseq object.

phylo_coral<-ps_filter(phylo_coral, lifestage != "Metamorphosed Polyp (183 hpf)")
phylo_coral<-ps_arrange(phylo_coral, lifestage)

Reorder lifestage.

sample_data(phylo_coral)$lifestage = factor(sample_data(phylo_coral)$lifestage, levels = c("Egg (1 hpf)","Embryo (5 hpf)","Embryo (38 hpf)","Embryo (65 hpf)", "Larvae (93 hpf)", "Larvae (163 hpf)", "Larvae (231 hpf)", "Metamorphosed Polyp (231 hpf)", "Attached Recruit (183 hpf)", "Attached Recruit (231 hpf)"))

Rename data frame.

GP1_profile = phylo_coral
GP1_profile

Minimum depth from all samples is 8657. We will try subsampling at 6000, 4000, 2000, 1000, 500, 100.

Subsample to 8657

Subsample to the minimum sampling depth of 8,657.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = min(sample_sums(GP1_profile)),
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 8657 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

sample_order<-c("WSH053", "WSH054", "WSH055", "WSH056", "WSH061", "WSH062", "WSH063", "WSH064",  "WSH065", "WSH066", "WSH067", "WSH068",  "WSH069", "WSH070", "WSH071", "WSH072", "WSH073", "WSH074", "WSH075", "WSH076", "WSH077", "WSH078", "WSH079", "WSH080", "WSH081", "WSH082", "WSH083", "WSH084", "WSH057", "WSH058", "WSH059", "WSH060", "WSH046", "WSH047", "WSH048", "WSH049", "WSH050", "WSH051", "WSH052")

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)

rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)  
lifestage    9 0.3829 0.04254   3.056 0.0112 *
Residuals   28 0.3897 0.01392    

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)
Egg (1 hpf) - Larvae (231 hpf)                              -0.3104 0.0834 28  -3.721  0.0254

Subsample to 6000

Subsample to the minimum sampling depth of 6000.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 6000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 6000 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)

rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

6000 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.559 0.0274    38 0.00445
2 D1-D4-D6-D1ab-D17d      0.441 0.0274    38 0.00445

Mean values are the same.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)  
lifestage    9 0.4081 0.04535   2.842 0.0164 *
Residuals   28 0.4467 0.01595  

Results are the same as original subsampling, but F value decreased.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)
Egg (1 hpf) - Larvae (231 hpf)                             -0.31767 0.0893 28  -3.557  0.0374

This comparison is still significant as in original subsampling, but P value has increased slightly.

Subsample to 4000

Subsample to the minimum sampling depth of 4000.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 4000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 4000 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)
rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

4000 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0298    38 0.00484
2 D1-D4-D6-D1ab-D17d      0.440 0.0298    38 0.00484

Mean values are the same.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value  Pr(>F)   
lifestage    9 0.5366 0.05962   3.633 0.00412 **
Residuals   28 0.4595 0.01641 

Results are the same as original subsampling, but P value actually decreased.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)
 Egg (1 hpf) - Larvae (231 hpf)                              -0.3359 0.0906 28  -3.708  0.0262
Egg (1 hpf) - Larvae (93 hpf)                               -0.3157 0.0906 28  -3.486  0.0440

This comparison is still significant as in original subsampling, and one other egg to larvae comparison is also significant.

Subsample to 2000

Subsample to the minimum sampling depth of 2000.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 2000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 2000 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)
rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

2000 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.559 0.0266    38 0.00431
2 D1-D4-D6-D1ab-D17d      0.441 0.0266    38 0.00431

Mean values are the same.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)  
lifestage    9 0.3762 0.04180   2.934 0.0139 *
Residuals   28 0.3989 0.01425  

Results are the same as original subsampling, but F value decreased slightly from original.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)
Egg (1 hpf) - Larvae (93 hpf)                               -0.3015 0.0844 28  -3.572  0.0361

One posthoc result is significant from egg to larvae.

Subsample to 1000

Subsample to the minimum sampling depth of 1000.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 1000,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 2000 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)
rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

1000 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.561 0.0286    38 0.00464
2 D1-D4-D6-D1ab-D17d      0.439 0.0286    38 0.00464

Mean values are the same.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)  
lifestage    9 0.4007 0.04452   2.695 0.0214 *
Residuals   28 0.4625 0.01652   

Results are the same as original subsampling, but P value increased slightly from original.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)
Egg (1 hpf) - Larvae (231 hpf)                             -0.33634 0.0909 28  -3.701  0.0267

Comparison is still significant from original subsampling.

Subsample to 500

Subsample to the minimum sampling depth of 500.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 500,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 500 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)
rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

500 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.563 0.0384    38 0.00623
2 D1-D4-D6-D1ab-D17d      0.437 0.0384    38 0.00623

Mean values are just slightly different from original by about 0.003.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

There is a significant change in C:D ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)  
lifestage    9 0.6824 0.07583   2.277 0.0463 *
Residuals   28 0.9325 0.03330   

Results are the same as original subsampling, but P value greatly increased slightly from original and error bars on plot are larger.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)

No significant post hoc comparisons.

Subsample to 100

Subsample to the minimum sampling depth of 100.

GP1_rare_profile<-rarefy_even_depth(GP1_profile, sample.size = 100,
  rngseed = TRUE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(GP1_rare_profile)

All samples now have 100 sequences.

Transform to relative abundance as a proportion.

GP1_rare_profile = transform_sample_counts(GP1_rare_profile, function(x) x/sum(x))

Prepare relative abundance data.

t<-plot_bar(GP1_rare_profile, fill="its2_type_profile")
t$data$Sample <- factor(t$data$Sample, levels = sample_order)
rel_abund_data<-t$data

Calculate the average percent abundance for each taxa across all stages.

means_all <- rel_abund_data %>%
  group_by(its2_type_profile) %>%
  summarise(mean=mean(Abundance), sd=sd(Abundance), n=length(Abundance), se=sd/sqrt(n));means_all
Original: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.560 0.0260    38 0.00422
2 D1-D4-D6-D1ab-D17d      0.440 0.0260    38 0.00422

100 subsample: 
  its2_type_profile        mean     sd     n      se
  <chr>                   <dbl>  <dbl> <int>   <dbl>
1 C31-C17d-C21-C21ac-C31a 0.572 0.0519    38 0.00842
2 D1-D4-D6-D1ab-D17d      0.428 0.0519    38 0.00842

Mean values are different from original by about 0.02.

Run ANOVA on ratios.

model<-rel_abund_data%>%
  select(its2_type_profile, Abundance, lifestage, Sample)%>%
  pivot_wider(values_from = Abundance, names_from = its2_type_profile)%>%
  mutate(ratio=`C31-C17d-C21-C21ac-C31a`/`D1-D4-D6-D1ab-D17d`)%>%
  aov(ratio~lifestage, data=.);summary(model)

No significant change in CD ratio across development.

            Df Sum Sq Mean Sq F value Pr(>F)
lifestage    9 0.9716 0.10796   1.093  0.399
Residuals   28 2.7645 0.09873    

Change in CD ratio could no longer be detected.

Conduct posthoc testing by lifestage.

emm<-emmeans(model, ~lifestage)
pairs(emm)

No significant post hoc comparisons.

Conclusions

Detection of changes in C:D ratio across development were lost at <1000 sequencing depth. Significance decreased and variability increased at <4000 sequencing depth.

Overall conclusions

Overall, we found no loss in capacity to detect DIV community composition changes across development at >4000 sequencing depth. We also found no loss in capacity to detect changes in profile level shifts across development at sequencing depth of >4000.

For this application in this species, the minimum sequencing depth should be 4000-6000.

Written on March 26, 2024